blob: 9a76d3583e19eb294518a94b11beb2b8e547e222 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
51#define MAX_UNICODE_FREELIST_SIZE 1024
52
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated character buffer intact for
   every object on the free list whose length (counted in Py_UNICODE
   elements, not bytes) is less than this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Anthony Baxterac6bd462006-04-13 02:06:09 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Guido van Rossumd57fd912000-03-10 22:53:23 +000092/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000093static PyUnicodeObject *unicode_freelist;
94static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000095
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000096/* The empty Unicode object is shared to improve performance. */
97static PyUnicodeObject *unicode_empty;
98
99/* Single character Unicode strings in the Latin-1 range are being
100 shared as well. */
101static PyUnicodeObject *unicode_latin1[256];
102
Fred Drakee4315f52000-05-09 19:53:39 +0000103/* Default encoding to use and assume when NULL is passed as encoding
104 parameter; it is initialized by _PyUnicode_Init().
105
106 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000107 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000108
109*/
Fred Drakee4315f52000-05-09 19:53:39 +0000110static char unicode_default_encoding[100];
111
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000112Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000113PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000114{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000115#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000116 return 0x10FFFF;
117#else
118 /* This is actually an illegal character, so it should
119 not be passed to unichr. */
120 return 0xFFFF;
121#endif
122}
123
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124/* --- Unicode Object ----------------------------------------------------- */
125
126static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000127int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000128 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129{
130 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000131
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000132 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000133 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000136 /* Resizing shared object (unicode_empty or single character
137 objects) in-place is not allowed. Use PyUnicode_Resize()
138 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000139 if (unicode == unicode_empty ||
140 (unicode->length == 1 &&
141 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000144 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000153 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000158 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 return 0;
169}
170
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is always U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
178
179static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181{
182 register PyUnicodeObject *unicode;
183
Tim Petersced69f82003-09-16 20:30:58 +0000184 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 if (length == 0 && unicode_empty != NULL) {
186 Py_INCREF(unicode_empty);
187 return unicode_empty;
188 }
189
190 /* Unicode freelist & memory allocation */
191 if (unicode_freelist) {
192 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000193 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000196 /* Keep-Alive optimization: we only upsize the buffer,
197 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000198 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000199 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000204 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000206 }
207 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 }
209 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000210 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 if (unicode == NULL)
212 return NULL;
213 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
214 }
215
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000216 if (!unicode->str) {
217 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000218 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000220 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000221 * the caller fails before initializing str -- unicode_resize()
222 * reads str[0], and the Keep-Alive optimization can keep memory
223 * allocated for str alive across a call to unicode_dealloc(unicode).
224 * We don't want unicode_resize to read uninitialized memory in
225 * that case.
226 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000227 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000229 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000231 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000233
234 onError:
235 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000236 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238}
239
240static
Guido van Rossum9475a232001-10-05 20:51:39 +0000241void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000243 if (PyUnicode_CheckExact(unicode) &&
244 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000245 /* Keep-Alive optimization */
246 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 unicode->str = NULL;
249 unicode->length = 0;
250 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 if (unicode->defenc) {
252 Py_DECREF(unicode->defenc);
253 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 }
255 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 *(PyUnicodeObject **)unicode = unicode_freelist;
257 unicode_freelist = unicode;
258 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000262 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000263 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 }
265}
266
Martin v. Löwis18e16552006-02-15 17:27:45 +0000267int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268{
269 register PyUnicodeObject *v;
270
271 /* Argument checks */
272 if (unicode == NULL) {
273 PyErr_BadInternalCall();
274 return -1;
275 }
276 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000277 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 PyErr_BadInternalCall();
279 return -1;
280 }
281
282 /* Resizing unicode_empty and single character objects is not
283 possible since these are being shared. We simply return a fresh
284 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000285 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286 (v == unicode_empty || v->length == 1)) {
287 PyUnicodeObject *w = _PyUnicode_New(length);
288 if (w == NULL)
289 return -1;
290 Py_UNICODE_COPY(w->str, v->str,
291 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000292 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000293 *unicode = (PyObject *)w;
294 return 0;
295 }
296
297 /* Note that we don't have to modify *unicode for unshared Unicode
298 objects, since we can modify them in-place. */
299 return unicode_resize(v, length);
300}
301
/* File-local shorthand for PyUnicode_Resize() that accepts the address
   of any object pointer variable and casts it to PyObject **.
   For use in unicodeobject.c only! */
#define _PyUnicode_Resize(objptr, newlen) \
    PyUnicode_Resize((PyObject **)(objptr), (newlen))
305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308{
309 PyUnicodeObject *unicode;
310
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000311 /* If the Unicode data is known at construction time, we can apply
312 some optimizations which share commonly used objects. */
313 if (u != NULL) {
314
315 /* Optimization for empty strings */
316 if (size == 0 && unicode_empty != NULL) {
317 Py_INCREF(unicode_empty);
318 return (PyObject *)unicode_empty;
319 }
320
321 /* Single character Unicode objects in the Latin-1 range are
322 shared when using this constructor */
323 if (size == 1 && *u < 256) {
324 unicode = unicode_latin1[*u];
325 if (!unicode) {
326 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000327 if (!unicode)
328 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000329 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 unicode_latin1[*u] = unicode;
331 }
332 Py_INCREF(unicode);
333 return (PyObject *)unicode;
334 }
335 }
Tim Petersced69f82003-09-16 20:30:58 +0000336
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 unicode = _PyUnicode_New(size);
338 if (!unicode)
339 return NULL;
340
341 /* Copy the Unicode data into the new object */
342 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
345 return (PyObject *)unicode;
346}
347
348#ifdef HAVE_WCHAR_H
349
350PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000351 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352{
353 PyUnicodeObject *unicode;
354
355 if (w == NULL) {
356 PyErr_BadInternalCall();
357 return NULL;
358 }
359
360 unicode = _PyUnicode_New(size);
361 if (!unicode)
362 return NULL;
363
364 /* Copy the wchar_t data into the new object */
365#ifdef HAVE_USABLE_WCHAR_T
366 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000367#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 {
369 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000370 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000372 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 *u++ = *w++;
374 }
375#endif
376
377 return (PyObject *)unicode;
378}
379
Martin v. Löwis18e16552006-02-15 17:27:45 +0000380Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
381 wchar_t *w,
382 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383{
384 if (unicode == NULL) {
385 PyErr_BadInternalCall();
386 return -1;
387 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000388
389 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000391 size = PyUnicode_GET_SIZE(unicode) + 1;
392
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393#ifdef HAVE_USABLE_WCHAR_T
394 memcpy(w, unicode->str, size * sizeof(wchar_t));
395#else
396 {
397 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000398 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000400 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 *w++ = *u++;
402 }
403#endif
404
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000405 if (size > PyUnicode_GET_SIZE(unicode))
406 return PyUnicode_GET_SIZE(unicode);
407 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 return size;
409}
410
411#endif
412
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000413PyObject *PyUnicode_FromOrdinal(int ordinal)
414{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000415 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000416
417#ifdef Py_UNICODE_WIDE
418 if (ordinal < 0 || ordinal > 0x10ffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x110000) "
421 "(wide Python build)");
422 return NULL;
423 }
424#else
425 if (ordinal < 0 || ordinal > 0xffff) {
426 PyErr_SetString(PyExc_ValueError,
427 "unichr() arg not in range(0x10000) "
428 "(narrow Python build)");
429 return NULL;
430 }
431#endif
432
Hye-Shik Chang40574832004-04-06 07:24:51 +0000433 s[0] = (Py_UNICODE)ordinal;
434 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000435}
436
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437PyObject *PyUnicode_FromObject(register PyObject *obj)
438{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000439 /* XXX Perhaps we should make this API an alias of
440 PyObject_Unicode() instead ?! */
441 if (PyUnicode_CheckExact(obj)) {
442 Py_INCREF(obj);
443 return obj;
444 }
445 if (PyUnicode_Check(obj)) {
446 /* For a Unicode subtype that's not a Unicode object,
447 return a true Unicode object with the same data. */
448 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
449 PyUnicode_GET_SIZE(obj));
450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
452}
453
454PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
455 const char *encoding,
456 const char *errors)
457{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000458 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000459 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000460 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 if (obj == NULL) {
463 PyErr_BadInternalCall();
464 return NULL;
465 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000467#if 0
468 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000469 that no encodings is given and then redirect to
470 PyObject_Unicode() which then applies the additional logic for
471 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000472
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000473 NOTE: This API should really only be used for object which
474 represent *encoded* Unicode !
475
476 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 if (PyUnicode_Check(obj)) {
478 if (encoding) {
479 PyErr_SetString(PyExc_TypeError,
480 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000482 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000483 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000484 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000485#else
486 if (PyUnicode_Check(obj)) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
489 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491#endif
492
493 /* Coerce object */
494 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 s = PyString_AS_STRING(obj);
496 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000497 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000498 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
499 /* Overwrite the error message with something more useful in
500 case of a TypeError. */
501 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000502 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000503 "coercing to Unicode: need string or buffer, "
504 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 obj->ob_type->tp_name);
506 goto onError;
507 }
Tim Petersced69f82003-09-16 20:30:58 +0000508
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (len == 0) {
511 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 return v;
518
519 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521}
522
523PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000524 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525 const char *encoding,
526 const char *errors)
527{
528 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000529
530 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000531 encoding = PyUnicode_GetDefaultEncoding();
532
533 /* Shortcuts for common default encodings */
534 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000536 else if (strcmp(encoding, "latin-1") == 0)
537 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000538#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
539 else if (strcmp(encoding, "mbcs") == 0)
540 return PyUnicode_DecodeMBCS(s, size, errors);
541#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000542 else if (strcmp(encoding, "ascii") == 0)
543 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
545 /* Decode via the codec registry */
546 buffer = PyBuffer_FromMemory((void *)s, size);
547 if (buffer == NULL)
548 goto onError;
549 unicode = PyCodec_Decode(buffer, encoding, errors);
550 if (unicode == NULL)
551 goto onError;
552 if (!PyUnicode_Check(unicode)) {
553 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000554 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 unicode->ob_type->tp_name);
556 Py_DECREF(unicode);
557 goto onError;
558 }
559 Py_DECREF(buffer);
560 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 onError:
563 Py_XDECREF(buffer);
564 return NULL;
565}
566
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000567PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
568 const char *encoding,
569 const char *errors)
570{
571 PyObject *v;
572
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577
578 if (encoding == NULL)
579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Decode via the codec registry */
582 v = PyCodec_Decode(unicode, encoding, errors);
583 if (v == NULL)
584 goto onError;
585 return v;
586
587 onError:
588 return NULL;
589}
590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000592 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 const char *encoding,
594 const char *errors)
595{
596 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000597
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 unicode = PyUnicode_FromUnicode(s, size);
599 if (unicode == NULL)
600 return NULL;
601 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
602 Py_DECREF(unicode);
603 return v;
604}
605
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000606PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
607 const char *encoding,
608 const char *errors)
609{
610 PyObject *v;
611
612 if (!PyUnicode_Check(unicode)) {
613 PyErr_BadArgument();
614 goto onError;
615 }
616
617 if (encoding == NULL)
618 encoding = PyUnicode_GetDefaultEncoding();
619
620 /* Encode via the codec registry */
621 v = PyCodec_Encode(unicode, encoding, errors);
622 if (v == NULL)
623 goto onError;
624 return v;
625
626 onError:
627 return NULL;
628}
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
631 const char *encoding,
632 const char *errors)
633{
634 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 if (!PyUnicode_Check(unicode)) {
637 PyErr_BadArgument();
638 goto onError;
639 }
Fred Drakee4315f52000-05-09 19:53:39 +0000640
Tim Petersced69f82003-09-16 20:30:58 +0000641 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000642 encoding = PyUnicode_GetDefaultEncoding();
643
644 /* Shortcuts for common default encodings */
645 if (errors == NULL) {
646 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000647 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000648 else if (strcmp(encoding, "latin-1") == 0)
649 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000650#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
651 else if (strcmp(encoding, "mbcs") == 0)
652 return PyUnicode_AsMBCSString(unicode);
653#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000654 else if (strcmp(encoding, "ascii") == 0)
655 return PyUnicode_AsASCIIString(unicode);
656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657
658 /* Encode via the codec registry */
659 v = PyCodec_Encode(unicode, encoding, errors);
660 if (v == NULL)
661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 if (!PyString_Check(v)) {
663 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000664 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 v->ob_type->tp_name);
666 Py_DECREF(v);
667 goto onError;
668 }
669 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000670
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 onError:
672 return NULL;
673}
674
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000675PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
676 const char *errors)
677{
678 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
679
680 if (v)
681 return v;
682 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
683 if (v && errors == NULL)
684 ((PyUnicodeObject *)unicode)->defenc = v;
685 return v;
686}
687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
689{
690 if (!PyUnicode_Check(unicode)) {
691 PyErr_BadArgument();
692 goto onError;
693 }
694 return PyUnicode_AS_UNICODE(unicode);
695
696 onError:
697 return NULL;
698}
699
Martin v. Löwis18e16552006-02-15 17:27:45 +0000700Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701{
702 if (!PyUnicode_Check(unicode)) {
703 PyErr_BadArgument();
704 goto onError;
705 }
706 return PyUnicode_GET_SIZE(unicode);
707
708 onError:
709 return -1;
710}
711
Thomas Wouters78890102000-07-22 19:25:51 +0000712const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000713{
714 return unicode_default_encoding;
715}
716
717int PyUnicode_SetDefaultEncoding(const char *encoding)
718{
719 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000720
Fred Drakee4315f52000-05-09 19:53:39 +0000721 /* Make sure the encoding is valid. As side effect, this also
722 loads the encoding into the codec registry cache. */
723 v = _PyCodec_Lookup(encoding);
724 if (v == NULL)
725 goto onError;
726 Py_DECREF(v);
727 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000728 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000729 sizeof(unicode_default_encoding));
730 return 0;
731
732 onError:
733 return -1;
734}
735
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000736/* error handling callback helper:
737 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000738 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000739 and adjust various state variables.
740 return 0 on success, -1 on error
741*/
742
743static
744int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
745 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000746 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
747 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000749 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750
751 PyObject *restuple = NULL;
752 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000753 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
754 Py_ssize_t requiredsize;
755 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000758 int res = -1;
759
760 if (*errorHandler == NULL) {
761 *errorHandler = PyCodec_LookupError(errors);
762 if (*errorHandler == NULL)
763 goto onError;
764 }
765
766 if (*exceptionObject == NULL) {
767 *exceptionObject = PyUnicodeDecodeError_Create(
768 encoding, input, insize, *startinpos, *endinpos, reason);
769 if (*exceptionObject == NULL)
770 goto onError;
771 }
772 else {
773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
774 goto onError;
775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
776 goto onError;
777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
778 goto onError;
779 }
780
781 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
782 if (restuple == NULL)
783 goto onError;
784 if (!PyTuple_Check(restuple)) {
785 PyErr_Format(PyExc_TypeError, &argparse[4]);
786 goto onError;
787 }
788 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
789 goto onError;
790 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000791 newpos = insize+newpos;
792 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000793 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000794 goto onError;
795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796
797 /* need more space? (at least enough for what we
798 have+the replacement+the rest of the string (starting
799 at the new input position), so we won't have to check space
800 when there are no errors in the rest of the string) */
801 repptr = PyUnicode_AS_UNICODE(repunicode);
802 repsize = PyUnicode_GET_SIZE(repunicode);
803 requiredsize = *outpos + repsize + insize-newpos;
804 if (requiredsize > outsize) {
805 if (requiredsize<2*outsize)
806 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000807 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 goto onError;
809 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
810 }
811 *endinpos = newpos;
812 *inptr = input + newpos;
813 Py_UNICODE_COPY(*outptr, repptr, repsize);
814 *outptr += repsize;
815 *outpos += repsize;
816 /* we made it! */
817 res = 0;
818
819 onError:
820 Py_XDECREF(restuple);
821 return res;
822}
823
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC 2152 for details */
827
/* Per-character classification used by the UTF-7 codec, indexed by
   the 7-bit character code.  Each entry says whether the character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
846
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be base64-encoded: it is outside the
   7-bit range, is classified 1 in utf7_special, or is classified as
   whitespace (2) / Set O (3) and the corresponding flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c belongs to the base64 alphabet.  Explicit ASCII ranges are
   used instead of isalnum(): the argument may be a wide Py_UNICODE
   value (undefined behaviour for isalnum) and the result must not
   depend on the current locale, since UB64() only understands the
   ASCII alphabet. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): map a base64 alphabet character to its 6-bit
   value.  Only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for as long as at least 6 bits are
   pending in ch; bits is the number of pending bits and is reduced by
   6 per character written. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Extract 16-bit code units from the pending bits in ch and store them
   to *out.  A unit in the range 0xDC00..0xDFFF is not stored: it sets
   the surrogate flag and errmsg and jumps to the enclosing function's
   utf7Error label; the next unit extracted while the flag is set is
   dropped and clears the flag.  Requires `errmsg` and the label
   `utf7Error` to be in scope at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000890PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000891 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 const char *errors)
893{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000894 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000895 Py_ssize_t startinpos;
896 Py_ssize_t endinpos;
897 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 const char *e;
899 PyUnicodeObject *unicode;
900 Py_UNICODE *p;
901 const char *errmsg = "";
902 int inShift = 0;
903 unsigned int bitsleft = 0;
904 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000905 int surrogate = 0;
906 PyObject *errorHandler = NULL;
907 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908
909 unicode = _PyUnicode_New(size);
910 if (!unicode)
911 return NULL;
912 if (size == 0)
913 return (PyObject *)unicode;
914
915 p = unicode->str;
916 e = s + size;
917
918 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 Py_UNICODE ch;
920 restart:
921 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000922
923 if (inShift) {
924 if ((ch == '-') || !B64CHAR(ch)) {
925 inShift = 0;
926 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000927
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
929 if (bitsleft >= 6) {
930 /* The shift sequence has a partial character in it. If
931 bitsleft < 6 then we could just classify it as padding
932 but that is not the case here */
933
934 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 }
937 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000938 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 here so indicate the potential of a misencoded character. */
940
941 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
942 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
943 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000944 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 }
946
947 if (ch == '-') {
948 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000949 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 inShift = 1;
951 }
952 } else if (SPECIAL(ch,0,0)) {
953 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000954 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 } else {
956 *p++ = ch;
957 }
958 } else {
959 charsleft = (charsleft << 6) | UB64(ch);
960 bitsleft += 6;
961 s++;
962 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
963 }
964 }
965 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000967 s++;
968 if (s < e && *s == '-') {
969 s++;
970 *p++ = '+';
971 } else
972 {
973 inShift = 1;
974 bitsleft = 0;
975 }
976 }
977 else if (SPECIAL(ch,0,0)) {
978 errmsg = "unexpected special character";
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982 else {
983 *p++ = ch;
984 s++;
985 }
986 continue;
987 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000988 outpos = p-PyUnicode_AS_UNICODE(unicode);
989 endinpos = s-starts;
990 if (unicode_decode_call_errorhandler(
991 errors, &errorHandler,
992 "utf7", errmsg,
993 starts, size, &startinpos, &endinpos, &exc, &s,
994 (PyObject **)&unicode, &outpos, &p))
995 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 }
997
998 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 outpos = p-PyUnicode_AS_UNICODE(unicode);
1000 endinpos = size;
1001 if (unicode_decode_call_errorhandler(
1002 errors, &errorHandler,
1003 "utf7", "unterminated shift sequence",
1004 starts, size, &startinpos, &endinpos, &exc, &s,
1005 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 if (s < e)
1008 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 }
1010
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001011 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 goto onError;
1013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 Py_XDECREF(errorHandler);
1015 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 return (PyObject *)unicode;
1017
1018onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 Py_XDECREF(errorHandler);
1020 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021 Py_DECREF(unicode);
1022 return NULL;
1023}
1024
1025
1026PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028 int encodeSetO,
1029 int encodeWhiteSpace,
1030 const char *errors)
1031{
1032 PyObject *v;
1033 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001034 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001036 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 unsigned int bitsleft = 0;
1038 unsigned long charsleft = 0;
1039 char * out;
1040 char * start;
1041
1042 if (size == 0)
1043 return PyString_FromStringAndSize(NULL, 0);
1044
1045 v = PyString_FromStringAndSize(NULL, cbAllocated);
1046 if (v == NULL)
1047 return NULL;
1048
1049 start = out = PyString_AS_STRING(v);
1050 for (;i < size; ++i) {
1051 Py_UNICODE ch = s[i];
1052
1053 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001054 if (ch == '+') {
1055 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001056 *out++ = '-';
1057 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1058 charsleft = ch;
1059 bitsleft = 16;
1060 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001061 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001063 } else {
1064 *out++ = (char) ch;
1065 }
1066 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001067 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1068 *out++ = B64(charsleft << (6-bitsleft));
1069 charsleft = 0;
1070 bitsleft = 0;
1071 /* Characters not in the BASE64 set implicitly unshift the sequence
1072 so no '-' is required, except if the character is itself a '-' */
1073 if (B64CHAR(ch) || ch == '-') {
1074 *out++ = '-';
1075 }
1076 inShift = 0;
1077 *out++ = (char) ch;
1078 } else {
1079 bitsleft += 16;
1080 charsleft = (charsleft << 16) | ch;
1081 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1082
1083 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001084 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 or '-' then the shift sequence will be terminated implicitly and we
1086 don't have to insert a '-'. */
1087
1088 if (bitsleft == 0) {
1089 if (i + 1 < size) {
1090 Py_UNICODE ch2 = s[i+1];
1091
1092 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001093
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 } else if (B64CHAR(ch2) || ch2 == '-') {
1095 *out++ = '-';
1096 inShift = 0;
1097 } else {
1098 inShift = 0;
1099 }
1100
1101 }
1102 else {
1103 *out++ = '-';
1104 inShift = 0;
1105 }
1106 }
Tim Petersced69f82003-09-16 20:30:58 +00001107 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 if (bitsleft) {
1111 *out++= B64(charsleft << (6-bitsleft) );
1112 *out++ = '-';
1113 }
1114
Tim Peters5de98422002-04-27 18:44:32 +00001115 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 return v;
1117}
1118
/* The UTF-7 helper macros are not needed beyond this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1125
/* --- UTF-8 Codec -------------------------------------------------------- */
1127
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1149
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001151 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *errors)
1153{
Walter Dörwald69652032004-09-07 20:24:22 +00001154 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1155}
1156
1157PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001159 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001160 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001164 Py_ssize_t startinpos;
1165 Py_ssize_t endinpos;
1166 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *e;
1168 PyUnicodeObject *unicode;
1169 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001170 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001171 PyObject *errorHandler = NULL;
1172 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
1174 /* Note: size will always be longer than the resulting Unicode
1175 character count */
1176 unicode = _PyUnicode_New(size);
1177 if (!unicode)
1178 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001179 if (size == 0) {
1180 if (consumed)
1181 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184
1185 /* Unpack UTF-8 encoded data */
1186 p = unicode->str;
1187 e = s + size;
1188
1189 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191
1192 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 s++;
1195 continue;
1196 }
1197
1198 n = utf8_code_length[ch];
1199
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001201 if (consumed)
1202 break;
1203 else {
1204 errmsg = "unexpected end of data";
1205 startinpos = s-starts;
1206 endinpos = size;
1207 goto utf8Error;
1208 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210
1211 switch (n) {
1212
1213 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001215 startinpos = s-starts;
1216 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218
1219 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001221 startinpos = s-starts;
1222 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001223 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224
1225 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 if ((s[1] & 0xc0) != 0x80) {
1227 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 startinpos = s-starts;
1229 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 goto utf8Error;
1231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001234 startinpos = s-starts;
1235 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 errmsg = "illegal encoding";
1237 goto utf8Error;
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 break;
1242
1243 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001244 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001245 (s[2] & 0xc0) != 0x80) {
1246 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001247 startinpos = s-starts;
1248 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001249 goto utf8Error;
1250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 if (ch < 0x0800) {
1253 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001254 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001255
1256 XXX For wide builds (UCS-4) we should probably try
1257 to recombine the surrogates into a single code
1258 unit.
1259 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001260 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 startinpos = s-starts;
1262 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 goto utf8Error;
1264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001266 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001267 break;
1268
1269 case 4:
1270 if ((s[1] & 0xc0) != 0x80 ||
1271 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 (s[3] & 0xc0) != 0x80) {
1273 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 startinpos = s-starts;
1275 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 goto utf8Error;
1277 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001278 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1279 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1280 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001281 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001282 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001284 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001285 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 startinpos = s-starts;
1288 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 goto utf8Error;
1290 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001291#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001292 *p++ = (Py_UNICODE)ch;
1293#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001294 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001295
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001296 /* translate from 10000..10FFFF to 0..FFFF */
1297 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001298
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001299 /* high surrogate = top 10 bits added to D800 */
1300 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001301
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001303 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001304#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 break;
1306
1307 default:
1308 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001309 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001310 startinpos = s-starts;
1311 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
1314 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001316
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 outpos = p-PyUnicode_AS_UNICODE(unicode);
1319 if (unicode_decode_call_errorhandler(
1320 errors, &errorHandler,
1321 "utf8", errmsg,
1322 starts, size, &startinpos, &endinpos, &exc, &s,
1323 (PyObject **)&unicode, &outpos, &p))
1324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
Walter Dörwald69652032004-09-07 20:24:22 +00001326 if (consumed)
1327 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001330 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 goto onError;
1332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 Py_XDECREF(errorHandler);
1334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335 return (PyObject *)unicode;
1336
1337onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 Py_XDECREF(errorHandler);
1339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 Py_DECREF(unicode);
1341 return NULL;
1342}
1343
Tim Peters602f7402002-04-27 18:03:26 +00001344/* Allocation strategy: if the string is short, convert into a stack buffer
1345 and allocate exactly as much space needed at the end. Else allocate the
1346 maximum possible needed (4 result bytes per Unicode character), and return
1347 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001348*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001349PyObject *
1350PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001351 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001352 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353{
Tim Peters602f7402002-04-27 18:03:26 +00001354#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001355
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001357 PyObject *v; /* result string object */
1358 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001359 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001360 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001361 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 assert(s != NULL);
1364 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters602f7402002-04-27 18:03:26 +00001366 if (size <= MAX_SHORT_UNICHARS) {
1367 /* Write into the stack buffer; nallocated can't overflow.
1368 * At the end, we'll allocate exactly as much heap space as it
1369 * turns out we need.
1370 */
1371 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1372 v = NULL; /* will allocate after we're done */
1373 p = stackbuf;
1374 }
1375 else {
1376 /* Overallocate on the heap, and give the excess back at the end. */
1377 nallocated = size * 4;
1378 if (nallocated / 4 != size) /* overflow! */
1379 return PyErr_NoMemory();
1380 v = PyString_FromStringAndSize(NULL, nallocated);
1381 if (v == NULL)
1382 return NULL;
1383 p = PyString_AS_STRING(v);
1384 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001385
Tim Peters602f7402002-04-27 18:03:26 +00001386 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001387 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001388
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001389 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001390 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001394 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001395 *p++ = (char)(0xc0 | (ch >> 6));
1396 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001397 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001398 else {
Tim Peters602f7402002-04-27 18:03:26 +00001399 /* Encode UCS2 Unicode ordinals */
1400 if (ch < 0x10000) {
1401 /* Special case: check for high surrogate */
1402 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1403 Py_UCS4 ch2 = s[i];
1404 /* Check for low surrogate and combine the two to
1405 form a UCS4 value */
1406 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001407 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001408 i++;
1409 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 }
Tim Peters602f7402002-04-27 18:03:26 +00001411 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001413 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001414 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1415 *p++ = (char)(0x80 | (ch & 0x3f));
1416 continue;
1417 }
1418encodeUCS4:
1419 /* Encode UCS4 Unicode ordinals */
1420 *p++ = (char)(0xf0 | (ch >> 18));
1421 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1422 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1423 *p++ = (char)(0x80 | (ch & 0x3f));
1424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001426
Tim Peters602f7402002-04-27 18:03:26 +00001427 if (v == NULL) {
1428 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001429 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001430 assert(nneeded <= nallocated);
1431 v = PyString_FromStringAndSize(stackbuf, nneeded);
1432 }
1433 else {
1434 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001435 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001436 assert(nneeded <= nallocated);
1437 _PyString_Resize(&v, nneeded);
1438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001440
Tim Peters602f7402002-04-27 18:03:26 +00001441#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442}
1443
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1445{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 return NULL;
1449 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001450 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1451 PyUnicode_GET_SIZE(unicode),
1452 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453}
1454
/* --- UTF-16 Codec ------------------------------------------------------- */
1456
Tim Peters772747b2001-08-09 22:21:55 +00001457PyObject *
1458PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001460 const char *errors,
1461 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462{
Walter Dörwald69652032004-09-07 20:24:22 +00001463 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1464}
1465
1466PyObject *
1467PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001469 const char *errors,
1470 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001472{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t startinpos;
1475 Py_ssize_t endinpos;
1476 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 PyUnicodeObject *unicode;
1478 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001479 const unsigned char *q, *e;
1480 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001481 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001482 /* Offsets from q for retrieving byte pairs in the right order. */
1483#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1484 int ihi = 1, ilo = 0;
1485#else
1486 int ihi = 0, ilo = 1;
1487#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001488 PyObject *errorHandler = NULL;
1489 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490
1491 /* Note: size will always be longer than the resulting Unicode
1492 character count */
1493 unicode = _PyUnicode_New(size);
1494 if (!unicode)
1495 return NULL;
1496 if (size == 0)
1497 return (PyObject *)unicode;
1498
1499 /* Unpack UTF-16 encoded data */
1500 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001501 q = (unsigned char *)s;
1502 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503
1504 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001505 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001507 /* Check for BOM marks (U+FEFF) in the input and adjust current
1508 byte order setting accordingly. In native mode, the leading BOM
1509 mark is skipped, in all other modes, it is copied to the output
1510 stream as-is (giving a ZWNBSP character). */
1511 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001512 if (size >= 2) {
1513 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001514#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001515 if (bom == 0xFEFF) {
1516 q += 2;
1517 bo = -1;
1518 }
1519 else if (bom == 0xFFFE) {
1520 q += 2;
1521 bo = 1;
1522 }
Tim Petersced69f82003-09-16 20:30:58 +00001523#else
Walter Dörwald69652032004-09-07 20:24:22 +00001524 if (bom == 0xFEFF) {
1525 q += 2;
1526 bo = 1;
1527 }
1528 else if (bom == 0xFFFE) {
1529 q += 2;
1530 bo = -1;
1531 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001532#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001533 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535
Tim Peters772747b2001-08-09 22:21:55 +00001536 if (bo == -1) {
1537 /* force LE */
1538 ihi = 1;
1539 ilo = 0;
1540 }
1541 else if (bo == 1) {
1542 /* force BE */
1543 ihi = 0;
1544 ilo = 1;
1545 }
1546
1547 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001548 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001549 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001551 if (consumed)
1552 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 errmsg = "truncated data";
1554 startinpos = ((const char *)q)-starts;
1555 endinpos = ((const char *)e)-starts;
1556 goto utf16Error;
1557 /* The remaining input chars are ignored if the callback
1558 chooses to skip the input */
1559 }
1560 ch = (q[ihi] << 8) | q[ilo];
1561
Tim Peters772747b2001-08-09 22:21:55 +00001562 q += 2;
1563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 if (ch < 0xD800 || ch > 0xDFFF) {
1565 *p++ = ch;
1566 continue;
1567 }
1568
1569 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001570 if (q >= e) {
1571 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = (((const char *)q)-2)-starts;
1573 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 goto utf16Error;
1575 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001576 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1578 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001580#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001581 *p++ = ch;
1582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583#else
1584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001585#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001586 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 }
1588 else {
1589 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 startinpos = (((const char *)q)-4)-starts;
1591 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001592 goto utf16Error;
1593 }
1594
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001596 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 startinpos = (((const char *)q)-2)-starts;
1598 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 /* Fall through to report the error */
1600
1601 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 outpos = p-PyUnicode_AS_UNICODE(unicode);
1603 if (unicode_decode_call_errorhandler(
1604 errors, &errorHandler,
1605 "utf16", errmsg,
1606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1607 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 }
1610
1611 if (byteorder)
1612 *byteorder = bo;
1613
Walter Dörwald69652032004-09-07 20:24:22 +00001614 if (consumed)
1615 *consumed = (const char *)q-starts;
1616
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 goto onError;
1620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_XDECREF(errorHandler);
1622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return (PyObject *)unicode;
1624
1625onError:
1626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 Py_XDECREF(errorHandler);
1628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 return NULL;
1630}
1631
Tim Peters772747b2001-08-09 22:21:55 +00001632PyObject *
1633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001634 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001635 const char *errors,
1636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637{
1638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001639 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001641 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001642#else
1643 const int pairs = 0;
1644#endif
Tim Peters772747b2001-08-09 22:21:55 +00001645 /* Offsets from p for storing byte pairs in the right order. */
1646#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1647 int ihi = 1, ilo = 0;
1648#else
1649 int ihi = 0, ilo = 1;
1650#endif
1651
1652#define STORECHAR(CH) \
1653 do { \
1654 p[ihi] = ((CH) >> 8) & 0xff; \
1655 p[ilo] = (CH) & 0xff; \
1656 p += 2; \
1657 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001659#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001660 for (i = pairs = 0; i < size; i++)
1661 if (s[i] >= 0x10000)
1662 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001663#endif
Tim Petersced69f82003-09-16 20:30:58 +00001664 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001665 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 if (v == NULL)
1667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001668
Tim Peters772747b2001-08-09 22:21:55 +00001669 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001671 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001672 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001673 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001674
1675 if (byteorder == -1) {
1676 /* force LE */
1677 ihi = 1;
1678 ilo = 0;
1679 }
1680 else if (byteorder == 1) {
1681 /* force BE */
1682 ihi = 0;
1683 ilo = 1;
1684 }
1685
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001686 while (size-- > 0) {
1687 Py_UNICODE ch = *s++;
1688 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001689#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001690 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1692 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#endif
Tim Peters772747b2001-08-09 22:21:55 +00001695 STORECHAR(ch);
1696 if (ch2)
1697 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001700#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701}
1702
1703PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1704{
1705 if (!PyUnicode_Check(unicode)) {
1706 PyErr_BadArgument();
1707 return NULL;
1708 }
1709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 NULL,
1712 0);
1713}
1714
1715/* --- Unicode Escape Codec ----------------------------------------------- */
1716
Fredrik Lundh06d12682001-01-24 07:59:11 +00001717static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001718
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001720 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 const char *errors)
1722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001724 Py_ssize_t startinpos;
1725 Py_ssize_t endinpos;
1726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001731 char* message;
1732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 PyObject *errorHandler = NULL;
1734 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001735
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 /* Escaped strings will always be longer than the resulting
1737 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 length after conversion to the true value.
1739 (but if the error callback returns a long replacement string
1740 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 v = _PyUnicode_New(size);
1742 if (v == NULL)
1743 goto onError;
1744 if (size == 0)
1745 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 while (s < end) {
1751 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001752 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 /* Non-escape characters are interpreted as Unicode ordinals */
1756 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 continue;
1759 }
1760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 /* \ - Escapes */
1763 s++;
1764 switch (*s++) {
1765
1766 /* \x escapes */
1767 case '\n': break;
1768 case '\\': *p++ = '\\'; break;
1769 case '\'': *p++ = '\''; break;
1770 case '\"': *p++ = '\"'; break;
1771 case 'b': *p++ = '\b'; break;
1772 case 'f': *p++ = '\014'; break; /* FF */
1773 case 't': *p++ = '\t'; break;
1774 case 'n': *p++ = '\n'; break;
1775 case 'r': *p++ = '\r'; break;
1776 case 'v': *p++ = '\013'; break; /* VT */
1777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1778
1779 /* \OOO (octal) escapes */
1780 case '0': case '1': case '2': case '3':
1781 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001782 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001784 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001786 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001788 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 break;
1790
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 /* hex escapes */
1792 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001794 digits = 2;
1795 message = "truncated \\xXX escape";
1796 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 digits = 4;
1801 message = "truncated \\uXXXX escape";
1802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Fredrik Lundhccc74732001-02-18 22:13:49 +00001804 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001805 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 digits = 8;
1807 message = "truncated \\UXXXXXXXX escape";
1808 hexescape:
1809 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (s+digits>end) {
1812 endinpos = size;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "unicodeescape", "end of string in escape sequence",
1816 starts, size, &startinpos, &endinpos, &exc, &s,
1817 (PyObject **)&v, &outpos, &p))
1818 goto onError;
1819 goto nextByte;
1820 }
1821 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 endinpos = (s+i+1)-starts;
1825 if (unicode_decode_call_errorhandler(
1826 errors, &errorHandler,
1827 "unicodeescape", message,
1828 starts, size, &startinpos, &endinpos, &exc, &s,
1829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001832 }
1833 chr = (chr<<4) & ~0xF;
1834 if (c >= '0' && c <= '9')
1835 chr += c - '0';
1836 else if (c >= 'a' && c <= 'f')
1837 chr += 10 + c - 'a';
1838 else
1839 chr += 10 + c - 'A';
1840 }
1841 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001842 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 /* _decoding_error will have already written into the
1844 target buffer. */
1845 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001847 /* when we get here, chr is a 32-bit unicode character */
1848 if (chr <= 0xffff)
1849 /* UCS-2 character */
1850 *p++ = (Py_UNICODE) chr;
1851 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001852 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001853 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001854#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001855 *p++ = chr;
1856#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 chr -= 0x10000L;
1858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001860#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001861 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 endinpos = s-starts;
1863 outpos = p-PyUnicode_AS_UNICODE(v);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "unicodeescape", "illegal Unicode character",
1867 starts, size, &startinpos, &endinpos, &exc, &s,
1868 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001869 goto onError;
1870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 break;
1872
1873 /* \N{name} */
1874 case 'N':
1875 message = "malformed \\N character escape";
1876 if (ucnhash_CAPI == NULL) {
1877 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001878 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 m = PyImport_ImportModule("unicodedata");
1880 if (m == NULL)
1881 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001882 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001883 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001884 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001885 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001887 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001888 if (ucnhash_CAPI == NULL)
1889 goto ucnhashError;
1890 }
1891 if (*s == '{') {
1892 const char *start = s+1;
1893 /* look for the closing brace */
1894 while (*s != '}' && s < end)
1895 s++;
1896 if (s > start && s < end && *s == '}') {
1897 /* found a name. look it up in the unicode database */
1898 message = "unknown Unicode character name";
1899 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001901 goto store;
1902 }
1903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 endinpos = s-starts;
1905 outpos = p-PyUnicode_AS_UNICODE(v);
1906 if (unicode_decode_call_errorhandler(
1907 errors, &errorHandler,
1908 "unicodeescape", message,
1909 starts, size, &startinpos, &endinpos, &exc, &s,
1910 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 break;
1913
1914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001916 message = "\\ at end of string";
1917 s--;
1918 endinpos = s-starts;
1919 outpos = p-PyUnicode_AS_UNICODE(v);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "unicodeescape", message,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001925 goto onError;
1926 }
1927 else {
1928 *p++ = '\\';
1929 *p++ = (unsigned char)s[-1];
1930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 nextByte:
1934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001941
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001943 PyErr_SetString(
1944 PyExc_UnicodeError,
1945 "\\N escapes not supported (can't load unicodedata module)"
1946 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 Py_XDECREF(errorHandler);
1949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001950 return NULL;
1951
Fredrik Lundhccc74732001-02-18 22:13:49 +00001952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 Py_XDECREF(errorHandler);
1955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 return NULL;
1957}
1958
1959/* Return a Unicode-Escape string version of the Unicode object.
1960
1961 If quotes is true, the string is enclosed in u"" or u'' quotes as
1962 appropriate.
1963
1964*/
1965
Barry Warsaw51ac5802000-03-20 16:36:48 +00001966static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001968 Py_UNICODE ch);
1969
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970static
1971PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 int quotes)
1974{
1975 PyObject *repr;
1976 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1981 if (repr == NULL)
1982 return NULL;
1983
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001984 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001988 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 !findchar(s, size, '"')) ? '"' : '\'';
1990 }
1991 while (size-- > 0) {
1992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001993
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001994 /* Escape quotes and backslashes */
1995 if ((quotes &&
1996 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 *p++ = '\\';
1998 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001999 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002000 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002002#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002003 /* Map 21-bit characters to '\U00xxxxxx' */
2004 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002005 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002006
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007 /* Resize the string if necessary */
2008 if (offset + 12 > PyString_GET_SIZE(repr)) {
2009 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002010 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002011 p = PyString_AS_STRING(repr) + offset;
2012 }
2013
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014 *p++ = '\\';
2015 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002016 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2022 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002023 *p++ = hexdigit[ch & 0x0000000F];
2024 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002026#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002027 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2028 else if (ch >= 0xD800 && ch < 0xDC00) {
2029 Py_UNICODE ch2;
2030 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002032 ch2 = *s++;
2033 size--;
2034 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2035 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2036 *p++ = '\\';
2037 *p++ = 'U';
2038 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2044 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2045 *p++ = hexdigit[ucs & 0x0000000F];
2046 continue;
2047 }
2048 /* Fall through: isolated surrogates are copied as-is */
2049 s--;
2050 size++;
2051 }
2052
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002054 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = '\\';
2056 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002057 *p++ = hexdigit[(ch >> 12) & 0x000F];
2058 *p++ = hexdigit[(ch >> 8) & 0x000F];
2059 *p++ = hexdigit[(ch >> 4) & 0x000F];
2060 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002062
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002063 /* Map special whitespace to '\t', \n', '\r' */
2064 else if (ch == '\t') {
2065 *p++ = '\\';
2066 *p++ = 't';
2067 }
2068 else if (ch == '\n') {
2069 *p++ = '\\';
2070 *p++ = 'n';
2071 }
2072 else if (ch == '\r') {
2073 *p++ = '\\';
2074 *p++ = 'r';
2075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002076
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002077 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002078 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002080 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002081 *p++ = hexdigit[(ch >> 4) & 0x000F];
2082 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 /* Copy everything else as-is */
2086 else
2087 *p++ = (char) ch;
2088 }
2089 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002090 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002093 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 return repr;
2095}
2096
2097PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002098 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099{
2100 return unicodeescape_string(s, size, 0);
2101}
2102
2103PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2104{
2105 if (!PyUnicode_Check(unicode)) {
2106 PyErr_BadArgument();
2107 return NULL;
2108 }
2109 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2110 PyUnicode_GET_SIZE(unicode));
2111}
2112
2113/* --- Raw Unicode Escape Codec ------------------------------------------- */
2114
2115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 const char *errors)
2118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002120 Py_ssize_t startinpos;
2121 Py_ssize_t endinpos;
2122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 const char *end;
2126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 PyObject *errorHandler = NULL;
2128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002129
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 /* Escaped strings will always be longer than the resulting
2131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 length after conversion to the true value. (But decoding error
2133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 v = _PyUnicode_New(size);
2135 if (v == NULL)
2136 goto onError;
2137 if (size == 0)
2138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 end = s + size;
2141 while (s < end) {
2142 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002143 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
2147 /* Non-escape characters are interpreted as Unicode ordinals */
2148 if (*s != '\\') {
2149 *p++ = (unsigned char)*s++;
2150 continue;
2151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
2154 /* \u-escapes are only interpreted iff the number of leading
2155 backslashes if odd */
2156 bs = s;
2157 for (;s < end;) {
2158 if (*s != '\\')
2159 break;
2160 *p++ = (unsigned char)*s++;
2161 }
2162 if (((s - bs) & 1) == 0 ||
2163 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002164 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 continue;
2166 }
2167 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002168 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 s++;
2170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002173 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 endinpos = s-starts;
2177 if (unicode_decode_call_errorhandler(
2178 errors, &errorHandler,
2179 "rawunicodeescape", "truncated \\uXXXX",
2180 starts, size, &startinpos, &endinpos, &exc, &s,
2181 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 }
2185 x = (x<<4) & ~0xF;
2186 if (c >= '0' && c <= '9')
2187 x += c - '0';
2188 else if (c >= 'a' && c <= 'f')
2189 x += 10 + c - 'a';
2190 else
2191 x += 10 + c - 'A';
2192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002193#ifndef Py_UNICODE_WIDE
2194 if (x > 0x10000) {
2195 if (unicode_decode_call_errorhandler(
2196 errors, &errorHandler,
2197 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2198 starts, size, &startinpos, &endinpos, &exc, &s,
2199 (PyObject **)&v, &outpos, &p))
2200 goto onError;
2201 }
2202#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 *p++ = x;
2204 nextByte:
2205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002207 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 Py_XDECREF(errorHandler);
2210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 onError:
2214 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 Py_XDECREF(errorHandler);
2216 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return NULL;
2218}
2219
2220PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002221 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222{
2223 PyObject *repr;
2224 char *p;
2225 char *q;
2226
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002227 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002229#ifdef Py_UNICODE_WIDE
2230 repr = PyString_FromStringAndSize(NULL, 10 * size);
2231#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002233#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 if (repr == NULL)
2235 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002236 if (size == 0)
2237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 p = q = PyString_AS_STRING(repr);
2240 while (size-- > 0) {
2241 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002242#ifdef Py_UNICODE_WIDE
2243 /* Map 32-bit characters to '\Uxxxxxxxx' */
2244 if (ch >= 0x10000) {
2245 *p++ = '\\';
2246 *p++ = 'U';
2247 *p++ = hexdigit[(ch >> 28) & 0xf];
2248 *p++ = hexdigit[(ch >> 24) & 0xf];
2249 *p++ = hexdigit[(ch >> 20) & 0xf];
2250 *p++ = hexdigit[(ch >> 16) & 0xf];
2251 *p++ = hexdigit[(ch >> 12) & 0xf];
2252 *p++ = hexdigit[(ch >> 8) & 0xf];
2253 *p++ = hexdigit[(ch >> 4) & 0xf];
2254 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 else
2257#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 /* Map 16-bit characters to '\uxxxx' */
2259 if (ch >= 256) {
2260 *p++ = '\\';
2261 *p++ = 'u';
2262 *p++ = hexdigit[(ch >> 12) & 0xf];
2263 *p++ = hexdigit[(ch >> 8) & 0xf];
2264 *p++ = hexdigit[(ch >> 4) & 0xf];
2265 *p++ = hexdigit[ch & 15];
2266 }
2267 /* Copy everything else as-is */
2268 else
2269 *p++ = (char) ch;
2270 }
2271 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002272 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 return repr;
2274}
2275
2276PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2277{
2278 if (!PyUnicode_Check(unicode)) {
2279 PyErr_BadArgument();
2280 return NULL;
2281 }
2282 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2283 PyUnicode_GET_SIZE(unicode));
2284}
2285
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002286/* --- Unicode Internal Codec ------------------------------------------- */
2287
2288PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002290 const char *errors)
2291{
2292 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002296 PyUnicodeObject *v;
2297 Py_UNICODE *p;
2298 const char *end;
2299 const char *reason;
2300 PyObject *errorHandler = NULL;
2301 PyObject *exc = NULL;
2302
Neal Norwitzd43069c2006-01-08 01:12:10 +00002303#ifdef Py_UNICODE_WIDE
2304 Py_UNICODE unimax = PyUnicode_GetMax();
2305#endif
2306
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2308 if (v == NULL)
2309 goto onError;
2310 if (PyUnicode_GetSize((PyObject *)v) == 0)
2311 return (PyObject *)v;
2312 p = PyUnicode_AS_UNICODE(v);
2313 end = s + size;
2314
2315 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002316 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002317 /* We have to sanity check the raw data, otherwise doom looms for
2318 some malformed UCS-4 data. */
2319 if (
2320 #ifdef Py_UNICODE_WIDE
2321 *p > unimax || *p < 0 ||
2322 #endif
2323 end-s < Py_UNICODE_SIZE
2324 )
2325 {
2326 startinpos = s - starts;
2327 if (end-s < Py_UNICODE_SIZE) {
2328 endinpos = end-starts;
2329 reason = "truncated input";
2330 }
2331 else {
2332 endinpos = s - starts + Py_UNICODE_SIZE;
2333 reason = "illegal code point (> 0x10FFFF)";
2334 }
2335 outpos = p - PyUnicode_AS_UNICODE(v);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "unicode_internal", reason,
2339 starts, size, &startinpos, &endinpos, &exc, &s,
2340 (PyObject **)&v, &outpos, &p)) {
2341 goto onError;
2342 }
2343 }
2344 else {
2345 p++;
2346 s += Py_UNICODE_SIZE;
2347 }
2348 }
2349
Martin v. Löwis412fb672006-04-13 06:34:32 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002351 goto onError;
2352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
2354 return (PyObject *)v;
2355
2356 onError:
2357 Py_XDECREF(v);
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return NULL;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Latin-1 Codec ------------------------------------------------------ */
2364
2365PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002366 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 const char *errors)
2368{
2369 PyUnicodeObject *v;
2370 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002371
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002373 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002374 Py_UNICODE r = *(unsigned char*)s;
2375 return PyUnicode_FromUnicode(&r, 1);
2376 }
2377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 v = _PyUnicode_New(size);
2379 if (v == NULL)
2380 goto onError;
2381 if (size == 0)
2382 return (PyObject *)v;
2383 p = PyUnicode_AS_UNICODE(v);
2384 while (size-- > 0)
2385 *p++ = (unsigned char)*s++;
2386 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 onError:
2389 Py_XDECREF(v);
2390 return NULL;
2391}
2392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393/* create or adjust a UnicodeEncodeError */
2394static void make_encode_exception(PyObject **exceptionObject,
2395 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002396 const Py_UNICODE *unicode, Py_ssize_t size,
2397 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 if (*exceptionObject == NULL) {
2401 *exceptionObject = PyUnicodeEncodeError_Create(
2402 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002405 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2406 goto onError;
2407 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2408 goto onError;
2409 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2410 goto onError;
2411 return;
2412 onError:
2413 Py_DECREF(*exceptionObject);
2414 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
2416}
2417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418/* raises a UnicodeEncodeError */
2419static void raise_encode_exception(PyObject **exceptionObject,
2420 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002421 const Py_UNICODE *unicode, Py_ssize_t size,
2422 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 const char *reason)
2424{
2425 make_encode_exception(exceptionObject,
2426 encoding, unicode, size, startpos, endpos, reason);
2427 if (*exceptionObject != NULL)
2428 PyCodec_StrictErrors(*exceptionObject);
2429}
2430
2431/* error handling callback helper:
2432 build arguments, call the callback and check the arguments,
2433 put the result into newpos and return the replacement string, which
2434 has to be freed by the caller */
2435static PyObject *unicode_encode_call_errorhandler(const char *errors,
2436 PyObject **errorHandler,
2437 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2439 Py_ssize_t startpos, Py_ssize_t endpos,
2440 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002442 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443
2444 PyObject *restuple;
2445 PyObject *resunicode;
2446
2447 if (*errorHandler == NULL) {
2448 *errorHandler = PyCodec_LookupError(errors);
2449 if (*errorHandler == NULL)
2450 return NULL;
2451 }
2452
2453 make_encode_exception(exceptionObject,
2454 encoding, unicode, size, startpos, endpos, reason);
2455 if (*exceptionObject == NULL)
2456 return NULL;
2457
2458 restuple = PyObject_CallFunctionObjArgs(
2459 *errorHandler, *exceptionObject, NULL);
2460 if (restuple == NULL)
2461 return NULL;
2462 if (!PyTuple_Check(restuple)) {
2463 PyErr_Format(PyExc_TypeError, &argparse[4]);
2464 Py_DECREF(restuple);
2465 return NULL;
2466 }
2467 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2468 &resunicode, newpos)) {
2469 Py_DECREF(restuple);
2470 return NULL;
2471 }
2472 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002473 *newpos = size+*newpos;
2474 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002475 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002476 Py_DECREF(restuple);
2477 return NULL;
2478 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_INCREF(resunicode);
2480 Py_DECREF(restuple);
2481 return resunicode;
2482}
2483
2484static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002485 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *errors,
2487 int limit)
2488{
2489 /* output object */
2490 PyObject *res;
2491 /* pointers to the beginning and end+1 of input */
2492 const Py_UNICODE *startp = p;
2493 const Py_UNICODE *endp = p + size;
2494 /* pointer to the beginning of the unencodable characters */
2495 /* const Py_UNICODE *badp = NULL; */
2496 /* pointer into the output */
2497 char *str;
2498 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 Py_ssize_t respos = 0;
2500 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002501 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2502 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 PyObject *errorHandler = NULL;
2504 PyObject *exc = NULL;
2505 /* the following variable is used for caching string comparisons
2506 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2507 int known_errorHandler = -1;
2508
2509 /* allocate enough for a simple encoding without
2510 replacements, if we need more, we'll resize */
2511 res = PyString_FromStringAndSize(NULL, size);
2512 if (res == NULL)
2513 goto onError;
2514 if (size == 0)
2515 return res;
2516 str = PyString_AS_STRING(res);
2517 ressize = size;
2518
2519 while (p<endp) {
2520 Py_UNICODE c = *p;
2521
2522 /* can we encode this? */
2523 if (c<limit) {
2524 /* no overflow check, because we know that the space is enough */
2525 *str++ = (char)c;
2526 ++p;
2527 }
2528 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002529 Py_ssize_t unicodepos = p-startp;
2530 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002532 Py_ssize_t repsize;
2533 Py_ssize_t newpos;
2534 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_UNICODE *uni2;
2536 /* startpos for collecting unencodable chars */
2537 const Py_UNICODE *collstart = p;
2538 const Py_UNICODE *collend = p;
2539 /* find all unecodable characters */
2540 while ((collend < endp) && ((*collend)>=limit))
2541 ++collend;
2542 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2543 if (known_errorHandler==-1) {
2544 if ((errors==NULL) || (!strcmp(errors, "strict")))
2545 known_errorHandler = 1;
2546 else if (!strcmp(errors, "replace"))
2547 known_errorHandler = 2;
2548 else if (!strcmp(errors, "ignore"))
2549 known_errorHandler = 3;
2550 else if (!strcmp(errors, "xmlcharrefreplace"))
2551 known_errorHandler = 4;
2552 else
2553 known_errorHandler = 0;
2554 }
2555 switch (known_errorHandler) {
2556 case 1: /* strict */
2557 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2558 goto onError;
2559 case 2: /* replace */
2560 while (collstart++<collend)
2561 *str++ = '?'; /* fall through */
2562 case 3: /* ignore */
2563 p = collend;
2564 break;
2565 case 4: /* xmlcharrefreplace */
2566 respos = str-PyString_AS_STRING(res);
2567 /* determine replacement size (temporarily (mis)uses p) */
2568 for (p = collstart, repsize = 0; p < collend; ++p) {
2569 if (*p<10)
2570 repsize += 2+1+1;
2571 else if (*p<100)
2572 repsize += 2+2+1;
2573 else if (*p<1000)
2574 repsize += 2+3+1;
2575 else if (*p<10000)
2576 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002577#ifndef Py_UNICODE_WIDE
2578 else
2579 repsize += 2+5+1;
2580#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 else if (*p<100000)
2582 repsize += 2+5+1;
2583 else if (*p<1000000)
2584 repsize += 2+6+1;
2585 else
2586 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002587#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 }
2589 requiredsize = respos+repsize+(endp-collend);
2590 if (requiredsize > ressize) {
2591 if (requiredsize<2*ressize)
2592 requiredsize = 2*ressize;
2593 if (_PyString_Resize(&res, requiredsize))
2594 goto onError;
2595 str = PyString_AS_STRING(res) + respos;
2596 ressize = requiredsize;
2597 }
2598 /* generate replacement (temporarily (mis)uses p) */
2599 for (p = collstart; p < collend; ++p) {
2600 str += sprintf(str, "&#%d;", (int)*p);
2601 }
2602 p = collend;
2603 break;
2604 default:
2605 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2606 encoding, reason, startp, size, &exc,
2607 collstart-startp, collend-startp, &newpos);
2608 if (repunicode == NULL)
2609 goto onError;
2610 /* need more space? (at least enough for what we
2611 have+the replacement+the rest of the string, so
2612 we won't have to check space for encodable characters) */
2613 respos = str-PyString_AS_STRING(res);
2614 repsize = PyUnicode_GET_SIZE(repunicode);
2615 requiredsize = respos+repsize+(endp-collend);
2616 if (requiredsize > ressize) {
2617 if (requiredsize<2*ressize)
2618 requiredsize = 2*ressize;
2619 if (_PyString_Resize(&res, requiredsize)) {
2620 Py_DECREF(repunicode);
2621 goto onError;
2622 }
2623 str = PyString_AS_STRING(res) + respos;
2624 ressize = requiredsize;
2625 }
2626 /* check if there is anything unencodable in the replacement
2627 and copy it to the output */
2628 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2629 c = *uni2;
2630 if (c >= limit) {
2631 raise_encode_exception(&exc, encoding, startp, size,
2632 unicodepos, unicodepos+1, reason);
2633 Py_DECREF(repunicode);
2634 goto onError;
2635 }
2636 *str = (char)c;
2637 }
2638 p = startp + newpos;
2639 Py_DECREF(repunicode);
2640 }
2641 }
2642 }
2643 /* Resize if we allocated to much */
2644 respos = str-PyString_AS_STRING(res);
2645 if (respos<ressize)
2646 /* If this falls res will be NULL */
2647 _PyString_Resize(&res, respos);
2648 Py_XDECREF(errorHandler);
2649 Py_XDECREF(exc);
2650 return res;
2651
2652 onError:
2653 Py_XDECREF(res);
2654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
2656 return NULL;
2657}
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002660 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 const char *errors)
2662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2667{
2668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
2672 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2673 PyUnicode_GET_SIZE(unicode),
2674 NULL);
2675}
2676
2677/* --- 7-bit ASCII Codec -------------------------------------------------- */
2678
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 const char *errors)
2682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 PyUnicodeObject *v;
2685 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *e;
2690 PyObject *errorHandler = NULL;
2691 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002694 if (size == 1 && *(unsigned char*)s < 128) {
2695 Py_UNICODE r = *(unsigned char*)s;
2696 return PyUnicode_FromUnicode(&r, 1);
2697 }
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 v = _PyUnicode_New(size);
2700 if (v == NULL)
2701 goto onError;
2702 if (size == 0)
2703 return (PyObject *)v;
2704 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 e = s + size;
2706 while (s < e) {
2707 register unsigned char c = (unsigned char)*s;
2708 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 ++s;
2711 }
2712 else {
2713 startinpos = s-starts;
2714 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002715 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "ascii", "ordinal not in range(128)",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002725 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 onError:
2732 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 return NULL;
2736}
2737
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 const char *errors)
2741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743}
2744
2745PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2746{
2747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 return NULL;
2750 }
2751 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2752 PyUnicode_GET_SIZE(unicode),
2753 NULL);
2754}
2755
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002756#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002760PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002762 const char *errors)
2763{
2764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002767
2768 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002769 assert(size < INT_MAX);
2770 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002771 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002772 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2773
2774 v = _PyUnicode_New(usize);
2775 if (v == NULL)
2776 return NULL;
2777 if (usize == 0)
2778 return (PyObject *)v;
2779 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002780 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002781 Py_DECREF(v);
2782 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2783 }
2784
2785 return (PyObject *)v;
2786}
2787
2788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002789 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002790 const char *errors)
2791{
2792 PyObject *repr;
2793 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002794 DWORD mbcssize;
2795
2796 /* If there are no characters, bail now! */
2797 if (size==0)
2798 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002799
2800 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 assert(size<INT_MAX);
2802 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 if (mbcssize==0)
2804 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2805
2806 repr = PyString_FromStringAndSize(NULL, mbcssize);
2807 if (repr == NULL)
2808 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002809 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002810 return repr;
2811
2812 /* Do the conversion */
2813 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 assert(size < INT_MAX);
2815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002816 Py_DECREF(repr);
2817 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2818 }
2819 return repr;
2820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002821
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002822PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831}
2832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002833#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835/* --- Character Mapping Codec -------------------------------------------- */
2836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002838 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyObject *mapping,
2840 const char *errors)
2841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 Py_ssize_t startinpos;
2844 Py_ssize_t endinpos;
2845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 PyUnicodeObject *v;
2848 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 PyObject *errorHandler = NULL;
2851 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002852 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Default to Latin-1 */
2856 if (mapping == NULL)
2857 return PyUnicode_DecodeLatin1(s, size, errors);
2858
2859 v = _PyUnicode_New(size);
2860 if (v == NULL)
2861 goto onError;
2862 if (size == 0)
2863 return (PyObject *)v;
2864 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002866 if (PyUnicode_CheckExact(mapping)) {
2867 mapstring = PyUnicode_AS_UNICODE(mapping);
2868 maplen = PyUnicode_GET_SIZE(mapping);
2869 while (s < e) {
2870 unsigned char ch = *s;
2871 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002873 if (ch < maplen)
2874 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002876 if (x == 0xfffe) {
2877 /* undefined mapping */
2878 outpos = p-PyUnicode_AS_UNICODE(v);
2879 startinpos = s-starts;
2880 endinpos = startinpos+1;
2881 if (unicode_decode_call_errorhandler(
2882 errors, &errorHandler,
2883 "charmap", "character maps to <undefined>",
2884 starts, size, &startinpos, &endinpos, &exc, &s,
2885 (PyObject **)&v, &outpos, &p)) {
2886 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002887 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002888 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002889 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002890 *p++ = x;
2891 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002893 }
2894 else {
2895 while (s < e) {
2896 unsigned char ch = *s;
2897 PyObject *w, *x;
2898
2899 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2900 w = PyInt_FromLong((long)ch);
2901 if (w == NULL)
2902 goto onError;
2903 x = PyObject_GetItem(mapping, w);
2904 Py_DECREF(w);
2905 if (x == NULL) {
2906 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2907 /* No mapping found means: mapping is undefined. */
2908 PyErr_Clear();
2909 x = Py_None;
2910 Py_INCREF(x);
2911 } else
2912 goto onError;
2913 }
2914
2915 /* Apply mapping */
2916 if (PyInt_Check(x)) {
2917 long value = PyInt_AS_LONG(x);
2918 if (value < 0 || value > 65535) {
2919 PyErr_SetString(PyExc_TypeError,
2920 "character mapping must be in range(65536)");
2921 Py_DECREF(x);
2922 goto onError;
2923 }
2924 *p++ = (Py_UNICODE)value;
2925 }
2926 else if (x == Py_None) {
2927 /* undefined mapping */
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 startinpos = s-starts;
2930 endinpos = startinpos+1;
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "charmap", "character maps to <undefined>",
2934 starts, size, &startinpos, &endinpos, &exc, &s,
2935 (PyObject **)&v, &outpos, &p)) {
2936 Py_DECREF(x);
2937 goto onError;
2938 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002939 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002940 continue;
2941 }
2942 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944
2945 if (targetsize == 1)
2946 /* 1-1 mapping */
2947 *p++ = *PyUnicode_AS_UNICODE(x);
2948
2949 else if (targetsize > 1) {
2950 /* 1-n mapping */
2951 if (targetsize > extrachars) {
2952 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2954 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002955 (targetsize << 2);
2956 extrachars += needed;
2957 if (_PyUnicode_Resize(&v,
2958 PyUnicode_GET_SIZE(v) + needed) < 0) {
2959 Py_DECREF(x);
2960 goto onError;
2961 }
2962 p = PyUnicode_AS_UNICODE(v) + oldpos;
2963 }
2964 Py_UNICODE_COPY(p,
2965 PyUnicode_AS_UNICODE(x),
2966 targetsize);
2967 p += targetsize;
2968 extrachars -= targetsize;
2969 }
2970 /* 1-0 mapping: skip the character */
2971 }
2972 else {
2973 /* wrong return value */
2974 PyErr_SetString(PyExc_TypeError,
2975 "character mapping must return integer, None or unicode");
2976 Py_DECREF(x);
2977 goto onError;
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002980 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
2983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 Py_XDECREF(errorHandler);
2987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 Py_XDECREF(errorHandler);
2992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 Py_XDECREF(v);
2994 return NULL;
2995}
2996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997/* Lookup the character ch in the mapping. If the character
2998 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002999 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 PyObject *w = PyInt_FromLong((long)c);
3003 PyObject *x;
3004
3005 if (w == NULL)
3006 return NULL;
3007 x = PyObject_GetItem(mapping, w);
3008 Py_DECREF(w);
3009 if (x == NULL) {
3010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3011 /* No mapping found means: mapping is undefined. */
3012 PyErr_Clear();
3013 x = Py_None;
3014 Py_INCREF(x);
3015 return x;
3016 } else
3017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003019 else if (x == Py_None)
3020 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 else if (PyInt_Check(x)) {
3022 long value = PyInt_AS_LONG(x);
3023 if (value < 0 || value > 255) {
3024 PyErr_SetString(PyExc_TypeError,
3025 "character mapping must be in range(256)");
3026 Py_DECREF(x);
3027 return NULL;
3028 }
3029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 else if (PyString_Check(x))
3032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 /* wrong return value */
3035 PyErr_SetString(PyExc_TypeError,
3036 "character mapping must return integer, None or str");
3037 Py_DECREF(x);
3038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040}
3041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042/* lookup the character, put the result in the output string and adjust
3043 various state variables. Reallocate the output string if not enough
3044 space is available. Return a new reference to the object that
3045 was put in the output buffer, or Py_None, if the mapping was undefined
3046 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003047 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048static
3049PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051{
3052 PyObject *rep = charmapencode_lookup(c, mapping);
3053
3054 if (rep==NULL)
3055 return NULL;
3056 else if (rep==Py_None)
3057 return rep;
3058 else {
3059 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 if (outsize<requiredsize) {
3064 /* exponentially overallocate to minimize reallocations */
3065 if (requiredsize < 2*outsize)
3066 requiredsize = 2*outsize;
3067 if (_PyString_Resize(outobj, requiredsize)) {
3068 Py_DECREF(rep);
3069 return NULL;
3070 }
3071 outstart = PyString_AS_STRING(*outobj);
3072 }
3073 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3074 }
3075 else {
3076 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003077 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3078 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 if (outsize<requiredsize) {
3080 /* exponentially overallocate to minimize reallocations */
3081 if (requiredsize < 2*outsize)
3082 requiredsize = 2*outsize;
3083 if (_PyString_Resize(outobj, requiredsize)) {
3084 Py_DECREF(rep);
3085 return NULL;
3086 }
3087 outstart = PyString_AS_STRING(*outobj);
3088 }
3089 memcpy(outstart + *outpos, repchars, repsize);
3090 *outpos += repsize;
3091 }
3092 }
3093 return rep;
3094}
3095
3096/* handle an error in PyUnicode_EncodeCharmap
3097 Return 0 on success, -1 on error */
3098static
3099int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003100 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003102 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003103 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104{
3105 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t repsize;
3107 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 Py_UNICODE *uni2;
3109 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003110 Py_ssize_t collstartpos = *inpos;
3111 Py_ssize_t collendpos = *inpos+1;
3112 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 char *encoding = "charmap";
3114 char *reason = "character maps to <undefined>";
3115
3116 PyObject *x;
3117 /* find all unencodable characters */
3118 while (collendpos < size) {
3119 x = charmapencode_lookup(p[collendpos], mapping);
3120 if (x==NULL)
3121 return -1;
3122 else if (x!=Py_None) {
3123 Py_DECREF(x);
3124 break;
3125 }
3126 Py_DECREF(x);
3127 ++collendpos;
3128 }
3129 /* cache callback name lookup
3130 * (if not done yet, i.e. it's the first error) */
3131 if (*known_errorHandler==-1) {
3132 if ((errors==NULL) || (!strcmp(errors, "strict")))
3133 *known_errorHandler = 1;
3134 else if (!strcmp(errors, "replace"))
3135 *known_errorHandler = 2;
3136 else if (!strcmp(errors, "ignore"))
3137 *known_errorHandler = 3;
3138 else if (!strcmp(errors, "xmlcharrefreplace"))
3139 *known_errorHandler = 4;
3140 else
3141 *known_errorHandler = 0;
3142 }
3143 switch (*known_errorHandler) {
3144 case 1: /* strict */
3145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3146 return -1;
3147 case 2: /* replace */
3148 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3149 x = charmapencode_output('?', mapping, res, respos);
3150 if (x==NULL) {
3151 return -1;
3152 }
3153 else if (x==Py_None) {
3154 Py_DECREF(x);
3155 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3156 return -1;
3157 }
3158 Py_DECREF(x);
3159 }
3160 /* fall through */
3161 case 3: /* ignore */
3162 *inpos = collendpos;
3163 break;
3164 case 4: /* xmlcharrefreplace */
3165 /* generate replacement (temporarily (mis)uses p) */
3166 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3167 char buffer[2+29+1+1];
3168 char *cp;
3169 sprintf(buffer, "&#%d;", (int)p[collpos]);
3170 for (cp = buffer; *cp; ++cp) {
3171 x = charmapencode_output(*cp, mapping, res, respos);
3172 if (x==NULL)
3173 return -1;
3174 else if (x==Py_None) {
3175 Py_DECREF(x);
3176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3177 return -1;
3178 }
3179 Py_DECREF(x);
3180 }
3181 }
3182 *inpos = collendpos;
3183 break;
3184 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003185 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 encoding, reason, p, size, exceptionObject,
3187 collstartpos, collendpos, &newpos);
3188 if (repunicode == NULL)
3189 return -1;
3190 /* generate replacement */
3191 repsize = PyUnicode_GET_SIZE(repunicode);
3192 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3193 x = charmapencode_output(*uni2, mapping, res, respos);
3194 if (x==NULL) {
3195 Py_DECREF(repunicode);
3196 return -1;
3197 }
3198 else if (x==Py_None) {
3199 Py_DECREF(repunicode);
3200 Py_DECREF(x);
3201 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3202 return -1;
3203 }
3204 Py_DECREF(x);
3205 }
3206 *inpos = newpos;
3207 Py_DECREF(repunicode);
3208 }
3209 return 0;
3210}
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyObject *mapping,
3215 const char *errors)
3216{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* output object */
3218 PyObject *res = NULL;
3219 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 PyObject *errorHandler = NULL;
3224 PyObject *exc = NULL;
3225 /* the following variable is used for caching string comparisons
3226 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3227 * 3=ignore, 4=xmlcharrefreplace */
3228 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229
3230 /* Default to Latin-1 */
3231 if (mapping == NULL)
3232 return PyUnicode_EncodeLatin1(p, size, errors);
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 /* allocate enough for a simple encoding without
3235 replacements, if we need more, we'll resize */
3236 res = PyString_FromStringAndSize(NULL, size);
3237 if (res == NULL)
3238 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003239 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 while (inpos<size) {
3243 /* try to encode it */
3244 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3245 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 if (x==Py_None) { /* unencodable character */
3248 if (charmap_encoding_error(p, size, &inpos, mapping,
3249 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003250 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003251 &res, &respos)) {
3252 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003253 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 else
3257 /* done with this character => adjust input position */
3258 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 Py_DECREF(x);
3260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 /* Resize if we allocated to much */
3263 if (respos<PyString_GET_SIZE(res)) {
3264 if (_PyString_Resize(&res, respos))
3265 goto onError;
3266 }
3267 Py_XDECREF(exc);
3268 Py_XDECREF(errorHandler);
3269 return res;
3270
3271 onError:
3272 Py_XDECREF(res);
3273 Py_XDECREF(exc);
3274 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 return NULL;
3276}
3277
3278PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3279 PyObject *mapping)
3280{
3281 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3286 PyUnicode_GET_SIZE(unicode),
3287 mapping,
3288 NULL);
3289}
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291/* create or adjust a UnicodeTranslateError */
3292static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003293 const Py_UNICODE *unicode, Py_ssize_t size,
3294 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 if (*exceptionObject == NULL) {
3298 *exceptionObject = PyUnicodeTranslateError_Create(
3299 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3303 goto onError;
3304 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3305 goto onError;
3306 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3307 goto onError;
3308 return;
3309 onError:
3310 Py_DECREF(*exceptionObject);
3311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
3313}
3314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315/* raises a UnicodeTranslateError */
3316static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 const Py_UNICODE *unicode, Py_ssize_t size,
3318 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 const char *reason)
3320{
3321 make_translate_exception(exceptionObject,
3322 unicode, size, startpos, endpos, reason);
3323 if (*exceptionObject != NULL)
3324 PyCodec_StrictErrors(*exceptionObject);
3325}
3326
3327/* error handling callback helper:
3328 build arguments, call the callback and check the arguments,
3329 put the result into newpos and return the replacement string, which
3330 has to be freed by the caller */
3331static PyObject *unicode_translate_call_errorhandler(const char *errors,
3332 PyObject **errorHandler,
3333 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003334 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3335 Py_ssize_t startpos, Py_ssize_t endpos,
3336 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003338 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339
Martin v. Löwis412fb672006-04-13 06:34:32 +00003340 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *restuple;
3342 PyObject *resunicode;
3343
3344 if (*errorHandler == NULL) {
3345 *errorHandler = PyCodec_LookupError(errors);
3346 if (*errorHandler == NULL)
3347 return NULL;
3348 }
3349
3350 make_translate_exception(exceptionObject,
3351 unicode, size, startpos, endpos, reason);
3352 if (*exceptionObject == NULL)
3353 return NULL;
3354
3355 restuple = PyObject_CallFunctionObjArgs(
3356 *errorHandler, *exceptionObject, NULL);
3357 if (restuple == NULL)
3358 return NULL;
3359 if (!PyTuple_Check(restuple)) {
3360 PyErr_Format(PyExc_TypeError, &argparse[4]);
3361 Py_DECREF(restuple);
3362 return NULL;
3363 }
3364 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_DECREF(restuple);
3367 return NULL;
3368 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 if (i_newpos<0)
3370 *newpos = size+i_newpos;
3371 else
3372 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003373 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003375 Py_DECREF(restuple);
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_INCREF(resunicode);
3379 Py_DECREF(restuple);
3380 return resunicode;
3381}
3382
3383/* Lookup the character ch in the mapping and put the result in result,
3384 which must be decrefed by the caller.
3385 Return 0 on success, -1 on error */
3386static
3387int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3388{
3389 PyObject *w = PyInt_FromLong((long)c);
3390 PyObject *x;
3391
3392 if (w == NULL)
3393 return -1;
3394 x = PyObject_GetItem(mapping, w);
3395 Py_DECREF(w);
3396 if (x == NULL) {
3397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3398 /* No mapping found means: use 1:1 mapping. */
3399 PyErr_Clear();
3400 *result = NULL;
3401 return 0;
3402 } else
3403 return -1;
3404 }
3405 else if (x == Py_None) {
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyInt_Check(x)) {
3410 long value = PyInt_AS_LONG(x);
3411 long max = PyUnicode_GetMax();
3412 if (value < 0 || value > max) {
3413 PyErr_Format(PyExc_TypeError,
3414 "character mapping must be in range(0x%lx)", max+1);
3415 Py_DECREF(x);
3416 return -1;
3417 }
3418 *result = x;
3419 return 0;
3420 }
3421 else if (PyUnicode_Check(x)) {
3422 *result = x;
3423 return 0;
3424 }
3425 else {
3426 /* wrong return value */
3427 PyErr_SetString(PyExc_TypeError,
3428 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003429 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 return -1;
3431 }
3432}
3433/* ensure that *outobj is at least requiredsize characters long,
3434if not reallocate and adjust various state variables.
3435Return 0 on success, -1 on error */
3436static
Walter Dörwald4894c302003-10-24 14:25:28 +00003437int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003441 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003443 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003445 if (requiredsize < 2 * oldsize)
3446 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003447 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 return -1;
3449 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 }
3451 return 0;
3452}
3453/* lookup the character, put the result in the output string and adjust
3454 various state variables. Return a new reference to the object that
3455 was put in the output buffer in *result, or Py_None, if the mapping was
3456 undefined (in which case no character was written).
3457 The called must decref result.
3458 Return 0 on success, -1 on error. */
3459static
Walter Dörwald4894c302003-10-24 14:25:28 +00003460int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003461 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003462 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463{
Walter Dörwald4894c302003-10-24 14:25:28 +00003464 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 return -1;
3466 if (*res==NULL) {
3467 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 }
3470 else if (*res==Py_None)
3471 ;
3472 else if (PyInt_Check(*res)) {
3473 /* no overflow check, because we know that the space is enough */
3474 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3475 }
3476 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (repsize==1) {
3479 /* no overflow check, because we know that the space is enough */
3480 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3481 }
3482 else if (repsize!=0) {
3483 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003484 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003485 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003486 repsize - 1;
3487 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 return -1;
3489 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3490 *outp += repsize;
3491 }
3492 }
3493 else
3494 return -1;
3495 return 0;
3496}
3497
3498PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 PyObject *mapping,
3501 const char *errors)
3502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 /* output object */
3504 PyObject *res = NULL;
3505 /* pointers to the beginning and end+1 of input */
3506 const Py_UNICODE *startp = p;
3507 const Py_UNICODE *endp = p + size;
3508 /* pointer into the output */
3509 Py_UNICODE *str;
3510 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 char *reason = "character maps to <undefined>";
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515 /* the following variable is used for caching string comparisons
3516 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3517 * 3=ignore, 4=xmlcharrefreplace */
3518 int known_errorHandler = -1;
3519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (mapping == NULL) {
3521 PyErr_BadArgument();
3522 return NULL;
3523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524
3525 /* allocate enough for a simple 1:1 translation without
3526 replacements, if we need more, we'll resize */
3527 res = PyUnicode_FromUnicode(NULL, size);
3528 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003529 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 return res;
3532 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 while (p<endp) {
3535 /* try to encode it */
3536 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003537 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 goto onError;
3540 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003541 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (x!=Py_None) /* it worked => adjust input pointer */
3543 ++p;
3544 else { /* untranslatable character */
3545 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t repsize;
3547 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting untranslatable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p+1;
3552 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 /* find all untranslatable characters */
3555 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003556 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 goto onError;
3558 Py_XDECREF(x);
3559 if (x!=Py_None)
3560 break;
3561 ++collend;
3562 }
3563 /* cache callback name lookup
3564 * (if not done yet, i.e. it's the first error) */
3565 if (known_errorHandler==-1) {
3566 if ((errors==NULL) || (!strcmp(errors, "strict")))
3567 known_errorHandler = 1;
3568 else if (!strcmp(errors, "replace"))
3569 known_errorHandler = 2;
3570 else if (!strcmp(errors, "ignore"))
3571 known_errorHandler = 3;
3572 else if (!strcmp(errors, "xmlcharrefreplace"))
3573 known_errorHandler = 4;
3574 else
3575 known_errorHandler = 0;
3576 }
3577 switch (known_errorHandler) {
3578 case 1: /* strict */
3579 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3580 goto onError;
3581 case 2: /* replace */
3582 /* No need to check for space, this is a 1:1 replacement */
3583 for (coll = collstart; coll<collend; ++coll)
3584 *str++ = '?';
3585 /* fall through */
3586 case 3: /* ignore */
3587 p = collend;
3588 break;
3589 case 4: /* xmlcharrefreplace */
3590 /* generate replacement (temporarily (mis)uses p) */
3591 for (p = collstart; p < collend; ++p) {
3592 char buffer[2+29+1+1];
3593 char *cp;
3594 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003595 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3597 goto onError;
3598 for (cp = buffer; *cp; ++cp)
3599 *str++ = *cp;
3600 }
3601 p = collend;
3602 break;
3603 default:
3604 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3605 reason, startp, size, &exc,
3606 collstart-startp, collend-startp, &newpos);
3607 if (repunicode == NULL)
3608 goto onError;
3609 /* generate replacement */
3610 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003611 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3617 *str++ = *uni2;
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 /* Resize if we allocated to much */
3624 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003625 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003626 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 }
3629 Py_XDECREF(exc);
3630 Py_XDECREF(errorHandler);
3631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 onError:
3634 Py_XDECREF(res);
3635 Py_XDECREF(exc);
3636 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 return NULL;
3638}
3639
3640PyObject *PyUnicode_Translate(PyObject *str,
3641 PyObject *mapping,
3642 const char *errors)
3643{
3644 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 str = PyUnicode_FromObject(str);
3647 if (str == NULL)
3648 goto onError;
3649 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3650 PyUnicode_GET_SIZE(str),
3651 mapping,
3652 errors);
3653 Py_DECREF(str);
3654 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 onError:
3657 Py_XDECREF(str);
3658 return NULL;
3659}
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossum9e896b32000-04-05 20:11:21 +00003661/* --- Decimal Encoder ---------------------------------------------------- */
3662
3663int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003665 char *output,
3666 const char *errors)
3667{
3668 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 PyObject *errorHandler = NULL;
3670 PyObject *exc = NULL;
3671 const char *encoding = "decimal";
3672 const char *reason = "invalid decimal Unicode string";
3673 /* the following variable is used for caching string comparisons
3674 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3675 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003676
3677 if (output == NULL) {
3678 PyErr_BadArgument();
3679 return -1;
3680 }
3681
3682 p = s;
3683 end = s + length;
3684 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003686 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t repsize;
3689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_UNICODE *uni2;
3691 Py_UNICODE *collstart;
3692 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossum9e896b32000-04-05 20:11:21 +00003694 if (Py_UNICODE_ISSPACE(ch)) {
3695 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003697 continue;
3698 }
3699 decimal = Py_UNICODE_TODECIMAL(ch);
3700 if (decimal >= 0) {
3701 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003703 continue;
3704 }
Guido van Rossumba477042000-04-06 18:18:10 +00003705 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003706 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003708 continue;
3709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* All other characters are considered unencodable */
3711 collstart = p;
3712 collend = p+1;
3713 while (collend < end) {
3714 if ((0 < *collend && *collend < 256) ||
3715 !Py_UNICODE_ISSPACE(*collend) ||
3716 Py_UNICODE_TODECIMAL(*collend))
3717 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* cache callback name lookup
3720 * (if not done yet, i.e. it's the first error) */
3721 if (known_errorHandler==-1) {
3722 if ((errors==NULL) || (!strcmp(errors, "strict")))
3723 known_errorHandler = 1;
3724 else if (!strcmp(errors, "replace"))
3725 known_errorHandler = 2;
3726 else if (!strcmp(errors, "ignore"))
3727 known_errorHandler = 3;
3728 else if (!strcmp(errors, "xmlcharrefreplace"))
3729 known_errorHandler = 4;
3730 else
3731 known_errorHandler = 0;
3732 }
3733 switch (known_errorHandler) {
3734 case 1: /* strict */
3735 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3736 goto onError;
3737 case 2: /* replace */
3738 for (p = collstart; p < collend; ++p)
3739 *output++ = '?';
3740 /* fall through */
3741 case 3: /* ignore */
3742 p = collend;
3743 break;
3744 case 4: /* xmlcharrefreplace */
3745 /* generate replacement (temporarily (mis)uses p) */
3746 for (p = collstart; p < collend; ++p)
3747 output += sprintf(output, "&#%d;", (int)*p);
3748 p = collend;
3749 break;
3750 default:
3751 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3752 encoding, reason, s, length, &exc,
3753 collstart-s, collend-s, &newpos);
3754 if (repunicode == NULL)
3755 goto onError;
3756 /* generate replacement */
3757 repsize = PyUnicode_GET_SIZE(repunicode);
3758 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3759 Py_UNICODE ch = *uni2;
3760 if (Py_UNICODE_ISSPACE(ch))
3761 *output++ = ' ';
3762 else {
3763 decimal = Py_UNICODE_TODECIMAL(ch);
3764 if (decimal >= 0)
3765 *output++ = '0' + decimal;
3766 else if (0 < ch && ch < 256)
3767 *output++ = (char)ch;
3768 else {
3769 Py_DECREF(repunicode);
3770 raise_encode_exception(&exc, encoding,
3771 s, length, collstart-s, collend-s, reason);
3772 goto onError;
3773 }
3774 }
3775 }
3776 p = s + newpos;
3777 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 }
3779 }
3780 /* 0-terminate the output string */
3781 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_XDECREF(exc);
3783 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003784 return 0;
3785
3786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 return -1;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792/* --- Helpers ------------------------------------------------------------ */
3793
Tim Petersced69f82003-09-16 20:30:58 +00003794static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003795Py_ssize_t count(PyUnicodeObject *self,
3796 Py_ssize_t start,
3797 Py_ssize_t end,
3798 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003800 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003802 if (start < 0)
3803 start += self->length;
3804 if (start < 0)
3805 start = 0;
3806 if (end > self->length)
3807 end = self->length;
3808 if (end < 0)
3809 end += self->length;
3810 if (end < 0)
3811 end = 0;
3812
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003813 if (substring->length == 0)
3814 return (end - start + 1);
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 end -= substring->length;
3817
3818 while (start <= end)
3819 if (Py_UNICODE_MATCH(self, start, substring)) {
3820 count++;
3821 start += substring->length;
3822 } else
3823 start++;
3824
3825 return count;
3826}
3827
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t start,
3831 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 str = PyUnicode_FromObject(str);
3836 if (str == NULL)
3837 return -1;
3838 substr = PyUnicode_FromObject(substr);
3839 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003840 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 return -1;
3842 }
Tim Petersced69f82003-09-16 20:30:58 +00003843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 result = count((PyUnicodeObject *)str,
3845 start, end,
3846 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 Py_DECREF(str);
3849 Py_DECREF(substr);
3850 return result;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t start,
3857 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 int direction)
3859{
3860 if (start < 0)
3861 start += self->length;
3862 if (start < 0)
3863 start = 0;
3864
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (end > self->length)
3866 end = self->length;
3867 if (end < 0)
3868 end += self->length;
3869 if (end < 0)
3870 end = 0;
3871
Guido van Rossum76afbd92002-08-20 17:29:29 +00003872 if (substring->length == 0)
3873 return (direction > 0) ? start : end;
3874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875 end -= substring->length;
3876
3877 if (direction < 0) {
3878 for (; end >= start; end--)
3879 if (Py_UNICODE_MATCH(self, end, substring))
3880 return end;
3881 } else {
3882 for (; start <= end; start++)
3883 if (Py_UNICODE_MATCH(self, start, substring))
3884 return start;
3885 }
3886
3887 return -1;
3888}
3889
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003892 Py_ssize_t start,
3893 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 int direction)
3895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 str = PyUnicode_FromObject(str);
3899 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003900 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 substr = PyUnicode_FromObject(substr);
3902 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003903 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003904 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 }
Tim Petersced69f82003-09-16 20:30:58 +00003906
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 result = findstring((PyUnicodeObject *)str,
3908 (PyUnicodeObject *)substr,
3909 start, end, direction);
3910 Py_DECREF(str);
3911 Py_DECREF(substr);
3912 return result;
3913}
3914
Tim Petersced69f82003-09-16 20:30:58 +00003915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916int tailmatch(PyUnicodeObject *self,
3917 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t start,
3919 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 int direction)
3921{
3922 if (start < 0)
3923 start += self->length;
3924 if (start < 0)
3925 start = 0;
3926
3927 if (substring->length == 0)
3928 return 1;
3929
3930 if (end > self->length)
3931 end = self->length;
3932 if (end < 0)
3933 end += self->length;
3934 if (end < 0)
3935 end = 0;
3936
3937 end -= substring->length;
3938 if (end < start)
3939 return 0;
3940
3941 if (direction > 0) {
3942 if (Py_UNICODE_MATCH(self, end, substring))
3943 return 1;
3944 } else {
3945 if (Py_UNICODE_MATCH(self, start, substring))
3946 return 1;
3947 }
3948
3949 return 0;
3950}
3951
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t start,
3955 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int direction)
3957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 str = PyUnicode_FromObject(str);
3961 if (str == NULL)
3962 return -1;
3963 substr = PyUnicode_FromObject(substr);
3964 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003965 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 return -1;
3967 }
Tim Petersced69f82003-09-16 20:30:58 +00003968
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 result = tailmatch((PyUnicodeObject *)str,
3970 (PyUnicodeObject *)substr,
3971 start, end, direction);
3972 Py_DECREF(str);
3973 Py_DECREF(substr);
3974 return result;
3975}
3976
Tim Petersced69f82003-09-16 20:30:58 +00003977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003979 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 Py_UNICODE ch)
3981{
3982 /* like wcschr, but doesn't stop at NULL characters */
3983
3984 while (size-- > 0) {
3985 if (*s == ch)
3986 return s;
3987 s++;
3988 }
3989
3990 return NULL;
3991}
3992
3993/* Apply fixfct filter to the Unicode object self and return a
3994 reference to the modified object */
3995
Tim Petersced69f82003-09-16 20:30:58 +00003996static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997PyObject *fixup(PyUnicodeObject *self,
3998 int (*fixfct)(PyUnicodeObject *s))
3999{
4000
4001 PyUnicodeObject *u;
4002
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (u == NULL)
4005 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006
4007 Py_UNICODE_COPY(u->str, self->str, self->length);
4008
Tim Peters7a29bd52001-09-12 03:03:31 +00004009 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 /* fixfct should return TRUE if it modified the buffer. If
4011 FALSE, return a reference to the original buffer instead
4012 (to save space, not time) */
4013 Py_INCREF(self);
4014 Py_DECREF(u);
4015 return (PyObject*) self;
4016 }
4017 return (PyObject*) u;
4018}
4019
Tim Petersced69f82003-09-16 20:30:58 +00004020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021int fixupper(PyUnicodeObject *self)
4022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 Py_UNICODE *s = self->str;
4025 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004026
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 while (len-- > 0) {
4028 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004029
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 ch = Py_UNICODE_TOUPPER(*s);
4031 if (ch != *s) {
4032 status = 1;
4033 *s = ch;
4034 }
4035 s++;
4036 }
4037
4038 return status;
4039}
4040
Tim Petersced69f82003-09-16 20:30:58 +00004041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042int fixlower(PyUnicodeObject *self)
4043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_UNICODE *s = self->str;
4046 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 while (len-- > 0) {
4049 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 ch = Py_UNICODE_TOLOWER(*s);
4052 if (ch != *s) {
4053 status = 1;
4054 *s = ch;
4055 }
4056 s++;
4057 }
4058
4059 return status;
4060}
4061
Tim Petersced69f82003-09-16 20:30:58 +00004062static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063int fixswapcase(PyUnicodeObject *self)
4064{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 Py_UNICODE *s = self->str;
4067 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 while (len-- > 0) {
4070 if (Py_UNICODE_ISUPPER(*s)) {
4071 *s = Py_UNICODE_TOLOWER(*s);
4072 status = 1;
4073 } else if (Py_UNICODE_ISLOWER(*s)) {
4074 *s = Py_UNICODE_TOUPPER(*s);
4075 status = 1;
4076 }
4077 s++;
4078 }
4079
4080 return status;
4081}
4082
Tim Petersced69f82003-09-16 20:30:58 +00004083static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084int fixcapitalize(PyUnicodeObject *self)
4085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004086 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004087 Py_UNICODE *s = self->str;
4088 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004090 if (len == 0)
4091 return 0;
4092 if (Py_UNICODE_ISLOWER(*s)) {
4093 *s = Py_UNICODE_TOUPPER(*s);
4094 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004096 s++;
4097 while (--len > 0) {
4098 if (Py_UNICODE_ISUPPER(*s)) {
4099 *s = Py_UNICODE_TOLOWER(*s);
4100 status = 1;
4101 }
4102 s++;
4103 }
4104 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
4107static
4108int fixtitle(PyUnicodeObject *self)
4109{
4110 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4111 register Py_UNICODE *e;
4112 int previous_is_cased;
4113
4114 /* Shortcut for single character strings */
4115 if (PyUnicode_GET_SIZE(self) == 1) {
4116 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4117 if (*p != ch) {
4118 *p = ch;
4119 return 1;
4120 }
4121 else
4122 return 0;
4123 }
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 e = p + PyUnicode_GET_SIZE(self);
4126 previous_is_cased = 0;
4127 for (; p < e; p++) {
4128 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (previous_is_cased)
4131 *p = Py_UNICODE_TOLOWER(ch);
4132 else
4133 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004134
4135 if (Py_UNICODE_ISLOWER(ch) ||
4136 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 Py_UNICODE_ISTITLE(ch))
4138 previous_is_cased = 1;
4139 else
4140 previous_is_cased = 0;
4141 }
4142 return 1;
4143}
4144
Tim Peters8ce9f162004-08-27 01:49:32 +00004145PyObject *
4146PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
Tim Peters8ce9f162004-08-27 01:49:32 +00004148 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004149 const Py_UNICODE blank = ' ';
4150 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004151 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004153 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4154 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004155 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4156 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004158 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004159 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 fseq = PySequence_Fast(seq, "");
4162 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004163 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004164 }
4165
Tim Peters91879ab2004-08-27 22:35:44 +00004166 /* Grrrr. A codec may be invoked to convert str objects to
4167 * Unicode, and so it's possible to call back into Python code
4168 * during PyUnicode_FromObject(), and so it's possible for a sick
4169 * codec to change the size of fseq (if seq is a list). Therefore
4170 * we have to keep refetching the size -- can't assume seqlen
4171 * is invariant.
4172 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004173 seqlen = PySequence_Fast_GET_SIZE(fseq);
4174 /* If empty sequence, return u"". */
4175 if (seqlen == 0) {
4176 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4177 goto Done;
4178 }
4179 /* If singleton sequence with an exact Unicode, return that. */
4180 if (seqlen == 1) {
4181 item = PySequence_Fast_GET_ITEM(fseq, 0);
4182 if (PyUnicode_CheckExact(item)) {
4183 Py_INCREF(item);
4184 res = (PyUnicodeObject *)item;
4185 goto Done;
4186 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004187 }
4188
Tim Peters05eba1f2004-08-27 21:32:02 +00004189 /* At least two items to join, or one that isn't exact Unicode. */
4190 if (seqlen > 1) {
4191 /* Set up sep and seplen -- they're needed. */
4192 if (separator == NULL) {
4193 sep = &blank;
4194 seplen = 1;
4195 }
4196 else {
4197 internal_separator = PyUnicode_FromObject(separator);
4198 if (internal_separator == NULL)
4199 goto onError;
4200 sep = PyUnicode_AS_UNICODE(internal_separator);
4201 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004202 /* In case PyUnicode_FromObject() mutated seq. */
4203 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004204 }
4205 }
4206
4207 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004208 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004209 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004210 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004211 res_p = PyUnicode_AS_UNICODE(res);
4212 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004213
Tim Peters05eba1f2004-08-27 21:32:02 +00004214 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004215 Py_ssize_t itemlen;
4216 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004217
4218 item = PySequence_Fast_GET_ITEM(fseq, i);
4219 /* Convert item to Unicode. */
4220 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4221 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004222 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004223 " %.80s found",
4224 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004225 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004226 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004227 item = PyUnicode_FromObject(item);
4228 if (item == NULL)
4229 goto onError;
4230 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004231
Tim Peters91879ab2004-08-27 22:35:44 +00004232 /* In case PyUnicode_FromObject() mutated seq. */
4233 seqlen = PySequence_Fast_GET_SIZE(fseq);
4234
Tim Peters8ce9f162004-08-27 01:49:32 +00004235 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004237 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004238 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 if (i < seqlen - 1) {
4241 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004242 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004243 goto Overflow;
4244 }
4245 if (new_res_used > res_alloc) {
4246 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004247 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004248 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004249 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004250 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004251 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004252 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004253 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004255 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004256 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004258
4259 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004260 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004261 res_p += itemlen;
4262 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004263 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004264 res_p += seplen;
4265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004267 res_used = new_res_used;
4268 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004269
Tim Peters05eba1f2004-08-27 21:32:02 +00004270 /* Shrink res to match the used area; this probably can't fail,
4271 * but it's cheap to check.
4272 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004273 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004274 goto onError;
4275
4276 Done:
4277 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004278 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 return (PyObject *)res;
4280
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 Overflow:
4282 PyErr_SetString(PyExc_OverflowError,
4283 "join() is too long for a Python string");
4284 Py_DECREF(item);
4285 /* fall through */
4286
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004288 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004289 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004290 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return NULL;
4292}
4293
Tim Petersced69f82003-09-16 20:30:58 +00004294static
4295PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004296 Py_ssize_t left,
4297 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 Py_UNICODE fill)
4299{
4300 PyUnicodeObject *u;
4301
4302 if (left < 0)
4303 left = 0;
4304 if (right < 0)
4305 right = 0;
4306
Tim Peters7a29bd52001-09-12 03:03:31 +00004307 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 Py_INCREF(self);
4309 return self;
4310 }
4311
4312 u = _PyUnicode_New(left + self->length + right);
4313 if (u) {
4314 if (left)
4315 Py_UNICODE_FILL(u->str, fill, left);
4316 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4317 if (right)
4318 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4319 }
4320
4321 return u;
4322}
4323
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro is not self-contained: the expanding function must declare
   `PyObject *str` (scratch variable) and `list`, and must provide an
   `onError` label, which is jumped to if creating or appending the
   slice fails.  Wrapped in do { } while (0) so that it behaves as a
   single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4334
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.

   The macro is not self-contained: the expanding function must declare
   `PyObject *str` (scratch variable) and `list`, and must provide an
   `onError` label, which is jumped to if creating or inserting the
   slice fails.  Wrapped in do { } while (0) so that it behaves as a
   single statement; invoke it with a trailing semicolon. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4345
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346static
4347PyObject *split_whitespace(PyUnicodeObject *self,
4348 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 register Py_ssize_t i;
4352 register Py_ssize_t j;
4353 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 PyObject *str;
4355
4356 for (i = j = 0; i < len; ) {
4357 /* find a token */
4358 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4359 i++;
4360 j = i;
4361 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4362 i++;
4363 if (j < i) {
4364 if (maxcount-- <= 0)
4365 break;
4366 SPLIT_APPEND(self->str, j, i);
4367 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4368 i++;
4369 j = i;
4370 }
4371 }
4372 if (j < len) {
4373 SPLIT_APPEND(self->str, j, len);
4374 }
4375 return list;
4376
4377 onError:
4378 Py_DECREF(list);
4379 return NULL;
4380}
4381
4382PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004383 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 register Py_ssize_t i;
4386 register Py_ssize_t j;
4387 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 PyObject *list;
4389 PyObject *str;
4390 Py_UNICODE *data;
4391
4392 string = PyUnicode_FromObject(string);
4393 if (string == NULL)
4394 return NULL;
4395 data = PyUnicode_AS_UNICODE(string);
4396 len = PyUnicode_GET_SIZE(string);
4397
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 list = PyList_New(0);
4399 if (!list)
4400 goto onError;
4401
4402 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004403 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 /* Find a line and append it */
4406 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4407 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408
4409 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004410 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 if (i < len) {
4412 if (data[i] == '\r' && i + 1 < len &&
4413 data[i+1] == '\n')
4414 i += 2;
4415 else
4416 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004417 if (keepends)
4418 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Guido van Rossum86662912000-04-11 15:38:46 +00004420 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 j = i;
4422 }
4423 if (j < len) {
4424 SPLIT_APPEND(data, j, len);
4425 }
4426
4427 Py_DECREF(string);
4428 return list;
4429
4430 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004431 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_DECREF(string);
4433 return NULL;
4434}
4435
Tim Petersced69f82003-09-16 20:30:58 +00004436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437PyObject *split_char(PyUnicodeObject *self,
4438 PyObject *list,
4439 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 register Py_ssize_t i;
4443 register Py_ssize_t j;
4444 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 PyObject *str;
4446
4447 for (i = j = 0; i < len; ) {
4448 if (self->str[i] == ch) {
4449 if (maxcount-- <= 0)
4450 break;
4451 SPLIT_APPEND(self->str, j, i);
4452 i = j = i + 1;
4453 } else
4454 i++;
4455 }
4456 if (j <= len) {
4457 SPLIT_APPEND(self->str, j, len);
4458 }
4459 return list;
4460
4461 onError:
4462 Py_DECREF(list);
4463 return NULL;
4464}
4465
Tim Petersced69f82003-09-16 20:30:58 +00004466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *split_substring(PyUnicodeObject *self,
4468 PyObject *list,
4469 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004470 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 register Py_ssize_t i;
4473 register Py_ssize_t j;
4474 Py_ssize_t len = self->length;
4475 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 PyObject *str;
4477
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004478 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 if (Py_UNICODE_MATCH(self, i, substring)) {
4480 if (maxcount-- <= 0)
4481 break;
4482 SPLIT_APPEND(self->str, j, i);
4483 i = j = i + sublen;
4484 } else
4485 i++;
4486 }
4487 if (j <= len) {
4488 SPLIT_APPEND(self->str, j, len);
4489 }
4490 return list;
4491
4492 onError:
4493 Py_DECREF(list);
4494 return NULL;
4495}
4496
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004497static
4498PyObject *rsplit_whitespace(PyUnicodeObject *self,
4499 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 register Py_ssize_t i;
4503 register Py_ssize_t j;
4504 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004505 PyObject *str;
4506
4507 for (i = j = len - 1; i >= 0; ) {
4508 /* find a token */
4509 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4510 i--;
4511 j = i;
4512 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4513 i--;
4514 if (j > i) {
4515 if (maxcount-- <= 0)
4516 break;
4517 SPLIT_INSERT(self->str, i + 1, j + 1);
4518 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4519 i--;
4520 j = i;
4521 }
4522 }
4523 if (j >= 0) {
4524 SPLIT_INSERT(self->str, 0, j + 1);
4525 }
4526 return list;
4527
4528 onError:
4529 Py_DECREF(list);
4530 return NULL;
4531}
4532
4533static
4534PyObject *rsplit_char(PyUnicodeObject *self,
4535 PyObject *list,
4536 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 register Py_ssize_t i;
4540 register Py_ssize_t j;
4541 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004542 PyObject *str;
4543
4544 for (i = j = len - 1; i >= 0; ) {
4545 if (self->str[i] == ch) {
4546 if (maxcount-- <= 0)
4547 break;
4548 SPLIT_INSERT(self->str, i + 1, j + 1);
4549 j = i = i - 1;
4550 } else
4551 i--;
4552 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004553 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004554 SPLIT_INSERT(self->str, 0, j + 1);
4555 }
4556 return list;
4557
4558 onError:
4559 Py_DECREF(list);
4560 return NULL;
4561}
4562
4563static
4564PyObject *rsplit_substring(PyUnicodeObject *self,
4565 PyObject *list,
4566 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 register Py_ssize_t i;
4570 register Py_ssize_t j;
4571 Py_ssize_t len = self->length;
4572 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004573 PyObject *str;
4574
4575 for (i = len - sublen, j = len; i >= 0; ) {
4576 if (Py_UNICODE_MATCH(self, i, substring)) {
4577 if (maxcount-- <= 0)
4578 break;
4579 SPLIT_INSERT(self->str, i + sublen, j);
4580 j = i;
4581 i -= sublen;
4582 } else
4583 i--;
4584 }
4585 if (j >= 0) {
4586 SPLIT_INSERT(self->str, 0, j);
4587 }
4588 return list;
4589
4590 onError:
4591 Py_DECREF(list);
4592 return NULL;
4593}
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004596#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
4598static
4599PyObject *split(PyUnicodeObject *self,
4600 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004601 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602{
4603 PyObject *list;
4604
4605 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004606 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 list = PyList_New(0);
4609 if (!list)
4610 return NULL;
4611
4612 if (substring == NULL)
4613 return split_whitespace(self,list,maxcount);
4614
4615 else if (substring->length == 1)
4616 return split_char(self,list,substring->str[0],maxcount);
4617
4618 else if (substring->length == 0) {
4619 Py_DECREF(list);
4620 PyErr_SetString(PyExc_ValueError, "empty separator");
4621 return NULL;
4622 }
4623 else
4624 return split_substring(self,list,substring,maxcount);
4625}
4626
Tim Petersced69f82003-09-16 20:30:58 +00004627static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004628PyObject *rsplit(PyUnicodeObject *self,
4629 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004630 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004631{
4632 PyObject *list;
4633
4634 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004635 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004636
4637 list = PyList_New(0);
4638 if (!list)
4639 return NULL;
4640
4641 if (substring == NULL)
4642 return rsplit_whitespace(self,list,maxcount);
4643
4644 else if (substring->length == 1)
4645 return rsplit_char(self,list,substring->str[0],maxcount);
4646
4647 else if (substring->length == 0) {
4648 Py_DECREF(list);
4649 PyErr_SetString(PyExc_ValueError, "empty separator");
4650 return NULL;
4651 }
4652 else
4653 return rsplit_substring(self,list,substring,maxcount);
4654}
4655
4656static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657PyObject *replace(PyUnicodeObject *self,
4658 PyUnicodeObject *str1,
4659 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661{
4662 PyUnicodeObject *u;
4663
4664 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004665 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666
4667 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004668 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004671 if (!findchar(self->str, self->length, str1->str[0]) &&
4672 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 /* nothing to replace, return original string */
4674 Py_INCREF(self);
4675 u = self;
4676 } else {
4677 Py_UNICODE u1 = str1->str[0];
4678 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004681 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 self->length
4683 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004685 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004686 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 for (i = 0; i < u->length; i++)
4688 if (u->str[i] == u1) {
4689 if (--maxcount < 0)
4690 break;
4691 u->str[i] = u2;
4692 }
4693 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
4696 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 Py_UNICODE *p;
4699
4700 /* replace strings */
4701 n = count(self, 0, self->length, str1);
4702 if (n > maxcount)
4703 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004704 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004706 if (PyUnicode_CheckExact(self)) {
4707 Py_INCREF(self);
4708 u = self;
4709 }
4710 else {
4711 u = (PyUnicodeObject *)
4712 PyUnicode_FromUnicode(self->str, self->length);
4713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 } else {
4715 u = _PyUnicode_New(
4716 self->length + n * (str2->length - str1->length));
4717 if (u) {
4718 i = 0;
4719 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004720 if (str1->length > 0) {
4721 while (i <= self->length - str1->length)
4722 if (Py_UNICODE_MATCH(self, i, str1)) {
4723 /* replace string segment */
4724 Py_UNICODE_COPY(p, str2->str, str2->length);
4725 p += str2->length;
4726 i += str1->length;
4727 if (--n <= 0) {
4728 /* copy remaining part */
4729 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4730 break;
4731 }
4732 } else
4733 *p++ = self->str[i++];
4734 } else {
4735 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 Py_UNICODE_COPY(p, str2->str, str2->length);
4737 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004738 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004741 }
4742 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 }
4745 }
4746 }
Tim Petersced69f82003-09-16 20:30:58 +00004747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return (PyObject *) u;
4749}
4750
4751/* --- Unicode Object Methods --------------------------------------------- */
4752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004753PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754"S.title() -> unicode\n\
4755\n\
4756Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004760unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 return fixup(self, fixtitle);
4763}
4764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004765PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766"S.capitalize() -> unicode\n\
4767\n\
4768Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004769have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
4771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004772unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return fixup(self, fixcapitalize);
4775}
4776
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method: split self on whitespace, capitalize every word and
   join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry, so do it here. */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4814
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004815/* Argument converter. Coerces to a single unicode character */
4816
4817static int
4818convert_uc(PyObject *obj, void *addr)
4819{
4820 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4821 PyObject *uniobj;
4822 Py_UNICODE *unistr;
4823
4824 uniobj = PyUnicode_FromObject(obj);
4825 if (uniobj == NULL) {
4826 PyErr_SetString(PyExc_TypeError,
4827 "The fill character cannot be converted to Unicode");
4828 return 0;
4829 }
4830 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4831 PyErr_SetString(PyExc_TypeError,
4832 "The fill character must be exactly one character long");
4833 Py_DECREF(uniobj);
4834 return 0;
4835 }
4836 unistr = PyUnicode_AS_UNICODE(uniobj);
4837 *fillcharloc = unistr[0];
4838 Py_DECREF(uniobj);
4839 return 1;
4840}
4841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004842PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004843"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845Return S centered in a Unicode string of length width. Padding is\n\
4846done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
4848static PyObject *
4849unicode_center(PyUnicodeObject *self, PyObject *args)
4850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t marg, left;
4852 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004853 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
Thomas Woutersde017742006-02-16 19:34:37 +00004855 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 return NULL;
4857
Tim Peters7a29bd52001-09-12 03:03:31 +00004858 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 Py_INCREF(self);
4860 return (PyObject*) self;
4861 }
4862
4863 marg = width - self->length;
4864 left = marg / 2 + (marg & width & 1);
4865
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004866 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867}
4868
Marc-André Lemburge5034372000-08-08 08:04:29 +00004869#if 0
4870
4871/* This code should go into some future Unicode collation support
4872 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004873 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004874
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004875/* speedy UTF-16 code point order comparison */
4876/* gleaned from: */
4877/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4878
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004879static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004880{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004882 0, 0, 0, 0, 0, 0, 0, 0,
4883 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004884 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004885};
4886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887static int
4888unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004891
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 Py_UNICODE *s1 = str1->str;
4893 Py_UNICODE *s2 = str2->str;
4894
4895 len1 = str1->length;
4896 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004897
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004899 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004900
4901 c1 = *s1++;
4902 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004903
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004904 if (c1 > (1<<11) * 26)
4905 c1 += utf16Fixup[c1>>11];
4906 if (c2 > (1<<11) * 26)
4907 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004908 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004909
4910 if (c1 != c2)
4911 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004912
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004913 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
4915
4916 return (len1 < len2) ? -1 : (len1 != len2);
4917}
4918
Marc-André Lemburge5034372000-08-08 08:04:29 +00004919#else
4920
4921static int
4922unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4923{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004925
4926 Py_UNICODE *s1 = str1->str;
4927 Py_UNICODE *s2 = str2->str;
4928
4929 len1 = str1->length;
4930 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004931
Marc-André Lemburge5034372000-08-08 08:04:29 +00004932 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004933 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004934
Fredrik Lundh45714e92001-06-26 16:39:36 +00004935 c1 = *s1++;
4936 c2 = *s2++;
4937
4938 if (c1 != c2)
4939 return (c1 < c2) ? -1 : 1;
4940
Marc-André Lemburge5034372000-08-08 08:04:29 +00004941 len1--; len2--;
4942 }
4943
4944 return (len1 < len2) ? -1 : (len1 != len2);
4945}
4946
4947#endif
4948
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949int PyUnicode_Compare(PyObject *left,
4950 PyObject *right)
4951{
4952 PyUnicodeObject *u = NULL, *v = NULL;
4953 int result;
4954
4955 /* Coerce the two arguments */
4956 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4957 if (u == NULL)
4958 goto onError;
4959 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4960 if (v == NULL)
4961 goto onError;
4962
Thomas Wouters7e474022000-07-16 12:04:32 +00004963 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 if (v == u) {
4965 Py_DECREF(u);
4966 Py_DECREF(v);
4967 return 0;
4968 }
4969
4970 result = unicode_compare(u, v);
4971
4972 Py_DECREF(u);
4973 Py_DECREF(v);
4974 return result;
4975
4976onError:
4977 Py_XDECREF(u);
4978 Py_XDECREF(v);
4979 return -1;
4980}
4981
Guido van Rossum403d68b2000-03-13 15:55:09 +00004982int PyUnicode_Contains(PyObject *container,
4983 PyObject *element)
4984{
4985 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 int result;
4987 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004988 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004989
4990 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004991 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004992 if (v == NULL) {
4993 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004994 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004995 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004996 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004997 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004998 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005000
Barry Warsaw817918c2002-08-06 16:58:21 +00005001 size = PyUnicode_GET_SIZE(v);
5002 rhs = PyUnicode_AS_UNICODE(v);
5003 lhs = PyUnicode_AS_UNICODE(u);
5004
Guido van Rossum403d68b2000-03-13 15:55:09 +00005005 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005006 if (size == 1) {
5007 end = lhs + PyUnicode_GET_SIZE(u);
5008 while (lhs < end) {
5009 if (*lhs++ == *rhs) {
5010 result = 1;
5011 break;
5012 }
5013 }
5014 }
5015 else {
5016 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5017 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005018 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005019 result = 1;
5020 break;
5021 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005022 }
5023 }
5024
5025 Py_DECREF(u);
5026 Py_DECREF(v);
5027 return result;
5028
5029onError:
5030 Py_XDECREF(u);
5031 Py_XDECREF(v);
5032 return -1;
5033}
5034
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035/* Concat to string or Unicode object giving a new Unicode object. */
5036
5037PyObject *PyUnicode_Concat(PyObject *left,
5038 PyObject *right)
5039{
5040 PyUnicodeObject *u = NULL, *v = NULL, *w;
5041
5042 /* Coerce the two arguments */
5043 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5044 if (u == NULL)
5045 goto onError;
5046 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5047 if (v == NULL)
5048 goto onError;
5049
5050 /* Shortcuts */
5051 if (v == unicode_empty) {
5052 Py_DECREF(v);
5053 return (PyObject *)u;
5054 }
5055 if (u == unicode_empty) {
5056 Py_DECREF(u);
5057 return (PyObject *)v;
5058 }
5059
5060 /* Concat the two Unicode strings */
5061 w = _PyUnicode_New(u->length + v->length);
5062 if (w == NULL)
5063 goto onError;
5064 Py_UNICODE_COPY(w->str, u->str, u->length);
5065 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5066
5067 Py_DECREF(u);
5068 Py_DECREF(v);
5069 return (PyObject *)w;
5070
5071onError:
5072 Py_XDECREF(u);
5073 Py_XDECREF(v);
5074 return NULL;
5075}
5076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005077PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078"S.count(sub[, start[, end]]) -> int\n\
5079\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005080Return the number of non-overlapping occurrences of substring sub in\n\
5081Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005082interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
5084static PyObject *
5085unicode_count(PyUnicodeObject *self, PyObject *args)
5086{
5087 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005088 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005089 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 PyObject *result;
5091
Guido van Rossumb8872e62000-05-09 14:14:27 +00005092 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5093 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 return NULL;
5095
5096 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5097 (PyObject *)substring);
5098 if (substring == NULL)
5099 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (start < 0)
5102 start += self->length;
5103 if (start < 0)
5104 start = 0;
5105 if (end > self->length)
5106 end = self->length;
5107 if (end < 0)
5108 end += self->length;
5109 if (end < 0)
5110 end = 0;
5111
5112 result = PyInt_FromLong((long) count(self, start, end, substring));
5113
5114 Py_DECREF(substring);
5115 return result;
5116}
5117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005118PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005119"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121Encodes S using the codec registered for encoding. encoding defaults\n\
5122to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005123handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5125'xmlcharrefreplace' as well as any other name registered with\n\
5126codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
5128static PyObject *
5129unicode_encode(PyUnicodeObject *self, PyObject *args)
5130{
5131 char *encoding = NULL;
5132 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005133 PyObject *v;
5134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5136 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005137 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005138 if (v == NULL)
5139 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005140 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5141 PyErr_Format(PyExc_TypeError,
5142 "encoder did not return a string/unicode object "
5143 "(type=%.400s)",
5144 v->ob_type->tp_name);
5145 Py_DECREF(v);
5146 return NULL;
5147 }
5148 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005149
5150 onError:
5151 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005152}
5153
5154PyDoc_STRVAR(decode__doc__,
5155"S.decode([encoding[,errors]]) -> string or unicode\n\
5156\n\
5157Decodes S using the codec registered for encoding. encoding defaults\n\
5158to the default encoding. errors may be given to set a different error\n\
5159handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5160a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5161as well as any other name registerd with codecs.register_error that is\n\
5162able to handle UnicodeDecodeErrors.");
5163
5164static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005165unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005166{
5167 char *encoding = NULL;
5168 char *errors = NULL;
5169 PyObject *v;
5170
5171 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5172 return NULL;
5173 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005174 if (v == NULL)
5175 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005176 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5177 PyErr_Format(PyExc_TypeError,
5178 "decoder did not return a string/unicode object "
5179 "(type=%.400s)",
5180 v->ob_type->tp_name);
5181 Py_DECREF(v);
5182 return NULL;
5183 }
5184 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005185
5186 onError:
5187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188}
5189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005190PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191"S.expandtabs([tabsize]) -> unicode\n\
5192\n\
5193Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005194If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
5196static PyObject*
5197unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5198{
5199 Py_UNICODE *e;
5200 Py_UNICODE *p;
5201 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 PyUnicodeObject *u;
5204 int tabsize = 8;
5205
5206 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5207 return NULL;
5208
Thomas Wouters7e474022000-07-16 12:04:32 +00005209 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 i = j = 0;
5211 e = self->str + self->length;
5212 for (p = self->str; p < e; p++)
5213 if (*p == '\t') {
5214 if (tabsize > 0)
5215 j += tabsize - (j % tabsize);
5216 }
5217 else {
5218 j++;
5219 if (*p == '\n' || *p == '\r') {
5220 i += j;
5221 j = 0;
5222 }
5223 }
5224
5225 /* Second pass: create output string and fill it */
5226 u = _PyUnicode_New(i + j);
5227 if (!u)
5228 return NULL;
5229
5230 j = 0;
5231 q = u->str;
5232
5233 for (p = self->str; p < e; p++)
5234 if (*p == '\t') {
5235 if (tabsize > 0) {
5236 i = tabsize - (j % tabsize);
5237 j += i;
5238 while (i--)
5239 *q++ = ' ';
5240 }
5241 }
5242 else {
5243 j++;
5244 *q++ = *p;
5245 if (*p == '\n' || *p == '\r')
5246 j = 0;
5247 }
5248
5249 return (PyObject*) u;
5250}
5251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005252PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253"S.find(sub [,start [,end]]) -> int\n\
5254\n\
5255Return the lowest index in S where substring sub is found,\n\
5256such that sub is contained within s[start,end]. Optional\n\
5257arguments start and end are interpreted as in slice notation.\n\
5258\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005259Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261static PyObject *
5262unicode_find(PyUnicodeObject *self, PyObject *args)
5263{
5264 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005266 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 PyObject *result;
5268
Guido van Rossumb8872e62000-05-09 14:14:27 +00005269 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5270 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 return NULL;
5272 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5273 (PyObject *)substring);
5274 if (substring == NULL)
5275 return NULL;
5276
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279 Py_DECREF(substring);
5280 return result;
5281}
5282
5283static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
5286 if (index < 0 || index >= self->length) {
5287 PyErr_SetString(PyExc_IndexError, "string index out of range");
5288 return NULL;
5289 }
5290
5291 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5292}
5293
5294static long
5295unicode_hash(PyUnicodeObject *self)
5296{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005297 /* Since Unicode objects compare equal to their ASCII string
5298 counterparts, they should use the individual character values
5299 as basis for their hash value. This is needed to assure that
5300 strings and Unicode objects behave in the same way as
5301 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005304 register Py_UNICODE *p;
5305 register long x;
5306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 if (self->hash != -1)
5308 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005309 len = PyUnicode_GET_SIZE(self);
5310 p = PyUnicode_AS_UNICODE(self);
5311 x = *p << 7;
5312 while (--len >= 0)
5313 x = (1000003*x) ^ *p++;
5314 x ^= PyUnicode_GET_SIZE(self);
5315 if (x == -1)
5316 x = -2;
5317 self->hash = x;
5318 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319}
5320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005321PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322"S.index(sub [,start [,end]]) -> int\n\
5323\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005324Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325
5326static PyObject *
5327unicode_index(PyUnicodeObject *self, PyObject *args)
5328{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005332 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
Guido van Rossumb8872e62000-05-09 14:14:27 +00005334 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5335 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5339 (PyObject *)substring);
5340 if (substring == NULL)
5341 return NULL;
5342
5343 result = findstring(self, substring, start, end, 1);
5344
5345 Py_DECREF(substring);
5346 if (result < 0) {
5347 PyErr_SetString(PyExc_ValueError, "substring not found");
5348 return NULL;
5349 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351}
5352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005353PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005354"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005356Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005357at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
5359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005360unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361{
5362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5363 register const Py_UNICODE *e;
5364 int cased;
5365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 /* Shortcut for single character strings */
5367 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005368 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005370 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005371 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 e = p + PyUnicode_GET_SIZE(self);
5375 cased = 0;
5376 for (; p < e; p++) {
5377 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005378
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 else if (!cased && Py_UNICODE_ISLOWER(ch))
5382 cased = 1;
5383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385}
5386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005387PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005388"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005390Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005391at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005394unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
5396 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5397 register const Py_UNICODE *e;
5398 int cased;
5399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 /* Shortcut for single character strings */
5401 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005402 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005404 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005405 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005406 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 e = p + PyUnicode_GET_SIZE(self);
5409 cased = 0;
5410 for (; p < e; p++) {
5411 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005414 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 else if (!cased && Py_UNICODE_ISUPPER(ch))
5416 cased = 1;
5417 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005418 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419}
5420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005421PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005422"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005424Return True if S is a titlecased string and there is at least one\n\
5425character in S, i.e. upper- and titlecase characters may only\n\
5426follow uncased characters and lowercase characters only cased ones.\n\
5427Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
5429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005430unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
5432 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5433 register const Py_UNICODE *e;
5434 int cased, previous_is_cased;
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 /* Shortcut for single character strings */
5437 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005438 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5439 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005442 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 e = p + PyUnicode_GET_SIZE(self);
5446 cased = 0;
5447 previous_is_cased = 0;
5448 for (; p < e; p++) {
5449 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5452 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005453 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 previous_is_cased = 1;
5455 cased = 1;
5456 }
5457 else if (Py_UNICODE_ISLOWER(ch)) {
5458 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 previous_is_cased = 1;
5461 cased = 1;
5462 }
5463 else
5464 previous_is_cased = 0;
5465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005466 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467}
5468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005469PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005470"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005472Return True if all characters in S are whitespace\n\
5473and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005476unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477{
5478 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5479 register const Py_UNICODE *e;
5480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 /* Shortcut for single character strings */
5482 if (PyUnicode_GET_SIZE(self) == 1 &&
5483 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005486 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005487 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005489
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 e = p + PyUnicode_GET_SIZE(self);
5491 for (; p < e; p++) {
5492 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005493 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496}
5497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005499"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005500\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005501Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005502and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005503
5504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005505unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005506{
5507 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5508 register const Py_UNICODE *e;
5509
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005510 /* Shortcut for single character strings */
5511 if (PyUnicode_GET_SIZE(self) == 1 &&
5512 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005513 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005514
5515 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005516 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005517 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005518
5519 e = p + PyUnicode_GET_SIZE(self);
5520 for (; p < e; p++) {
5521 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005522 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005523 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005524 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005525}
5526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005527PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005528"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005529\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005530Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005531and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005532
5533static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005534unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005535{
5536 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5537 register const Py_UNICODE *e;
5538
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005539 /* Shortcut for single character strings */
5540 if (PyUnicode_GET_SIZE(self) == 1 &&
5541 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005542 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005543
5544 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005545 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005546 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005547
5548 e = p + PyUnicode_GET_SIZE(self);
5549 for (; p < e; p++) {
5550 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005551 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554}
5555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005556PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005557"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005560False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
5562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005563unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564{
5565 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5566 register const Py_UNICODE *e;
5567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 /* Shortcut for single character strings */
5569 if (PyUnicode_GET_SIZE(self) == 1 &&
5570 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005571 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005573 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005574 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005575 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 e = p + PyUnicode_GET_SIZE(self);
5578 for (; p < e; p++) {
5579 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005580 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583}
5584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005585PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005586"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005588Return True if all characters in S are digits\n\
5589and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
5591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005592unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
5594 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5595 register const Py_UNICODE *e;
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 /* Shortcut for single character strings */
5598 if (PyUnicode_GET_SIZE(self) == 1 &&
5599 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005602 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005603 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005604 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005605
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 e = p + PyUnicode_GET_SIZE(self);
5607 for (; p < e; p++) {
5608 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005609 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612}
5613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005614PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005615"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
5620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005621unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
5623 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5624 register const Py_UNICODE *e;
5625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 /* Shortcut for single character strings */
5627 if (PyUnicode_GET_SIZE(self) == 1 &&
5628 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005631 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005632 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005633 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 e = p + PyUnicode_GET_SIZE(self);
5636 for (; p < e; p++) {
5637 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005638 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005643PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644"S.join(sequence) -> unicode\n\
5645\n\
5646Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005647sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
5649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005650unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653}
5654
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656unicode_length(PyUnicodeObject *self)
5657{
5658 return self->length;
5659}
5660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005662"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663\n\
5664Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005665done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
5667static PyObject *
5668unicode_ljust(PyUnicodeObject *self, PyObject *args)
5669{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005670 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005671 Py_UNICODE fillchar = ' ';
5672
Martin v. Löwis412fb672006-04-13 06:34:32 +00005673 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 return NULL;
5675
Tim Peters7a29bd52001-09-12 03:03:31 +00005676 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 Py_INCREF(self);
5678 return (PyObject*) self;
5679 }
5680
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005681 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682}
5683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685"S.lower() -> unicode\n\
5686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005687Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
5689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005690unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 return fixup(self, fixlower);
5693}
5694
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings for PyArg_ParseTuple, indexed by the selectors
   above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5703
5704static const Py_UNICODE *
5705unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5706{
Tim Peters030a5ce2002-04-22 19:00:10 +00005707 size_t i;
5708 for (i = 0; i < n; ++i)
5709 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005710 return s+i;
5711 return NULL;
5712}
5713
5714/* externally visible for str.strip(unicode) */
5715PyObject *
5716_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5717{
5718 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005719 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005720 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5722 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005723
5724 i = 0;
5725 if (striptype != RIGHTSTRIP) {
5726 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5727 i++;
5728 }
5729 }
5730
5731 j = len;
5732 if (striptype != LEFTSTRIP) {
5733 do {
5734 j--;
5735 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5736 j++;
5737 }
5738
5739 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5740 Py_INCREF(self);
5741 return (PyObject*)self;
5742 }
5743 else
5744 return PyUnicode_FromUnicode(s+i, j-i);
5745}
5746
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
5748static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005749do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005751 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005753
5754 i = 0;
5755 if (striptype != RIGHTSTRIP) {
5756 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5757 i++;
5758 }
5759 }
5760
5761 j = len;
5762 if (striptype != LEFTSTRIP) {
5763 do {
5764 j--;
5765 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5766 j++;
5767 }
5768
5769 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5770 Py_INCREF(self);
5771 return (PyObject*)self;
5772 }
5773 else
5774 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}
5776
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005777
5778static PyObject *
5779do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5780{
5781 PyObject *sep = NULL;
5782
5783 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5784 return NULL;
5785
5786 if (sep != NULL && sep != Py_None) {
5787 if (PyUnicode_Check(sep))
5788 return _PyUnicode_XStrip(self, striptype, sep);
5789 else if (PyString_Check(sep)) {
5790 PyObject *res;
5791 sep = PyUnicode_FromObject(sep);
5792 if (sep==NULL)
5793 return NULL;
5794 res = _PyUnicode_XStrip(self, striptype, sep);
5795 Py_DECREF(sep);
5796 return res;
5797 }
5798 else {
5799 PyErr_Format(PyExc_TypeError,
5800 "%s arg must be None, unicode or str",
5801 STRIPNAME(striptype));
5802 return NULL;
5803 }
5804 }
5805
5806 return do_strip(self, striptype);
5807}
5808
5809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005810PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005811"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005812\n\
5813Return a copy of the string S with leading and trailing\n\
5814whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005815If chars is given and not None, remove characters in chars instead.\n\
5816If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005817
5818static PyObject *
5819unicode_strip(PyUnicodeObject *self, PyObject *args)
5820{
5821 if (PyTuple_GET_SIZE(args) == 0)
5822 return do_strip(self, BOTHSTRIP); /* Common case */
5823 else
5824 return do_argstrip(self, BOTHSTRIP, args);
5825}
5826
5827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005828PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005829"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005830\n\
5831Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005832If chars is given and not None, remove characters in chars instead.\n\
5833If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005834
5835static PyObject *
5836unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5837{
5838 if (PyTuple_GET_SIZE(args) == 0)
5839 return do_strip(self, LEFTSTRIP); /* Common case */
5840 else
5841 return do_argstrip(self, LEFTSTRIP, args);
5842}
5843
5844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005846"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005847\n\
5848Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005849If chars is given and not None, remove characters in chars instead.\n\
5850If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005851
5852static PyObject *
5853unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5854{
5855 if (PyTuple_GET_SIZE(args) == 0)
5856 return do_strip(self, RIGHTSTRIP); /* Common case */
5857 else
5858 return do_argstrip(self, RIGHTSTRIP, args);
5859}
5860
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
5865 PyUnicodeObject *u;
5866 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005868 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 if (len < 0)
5871 len = 0;
5872
Tim Peters7a29bd52001-09-12 03:03:31 +00005873 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 /* no repeat, return original string */
5875 Py_INCREF(str);
5876 return (PyObject*) str;
5877 }
Tim Peters8f422462000-09-09 06:13:41 +00005878
5879 /* ensure # of chars needed doesn't overflow int and # of bytes
5880 * needed doesn't overflow size_t
5881 */
5882 nchars = len * str->length;
5883 if (len && nchars / len != str->length) {
5884 PyErr_SetString(PyExc_OverflowError,
5885 "repeated string is too long");
5886 return NULL;
5887 }
5888 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5889 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5890 PyErr_SetString(PyExc_OverflowError,
5891 "repeated string is too long");
5892 return NULL;
5893 }
5894 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 if (!u)
5896 return NULL;
5897
5898 p = u->str;
5899
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005900 if (str->length == 1 && len > 0) {
5901 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005902 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00005903 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005904 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005905 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005906 done = str->length;
5907 }
5908 while (done < nchars) {
5909 int n = (done <= nchars-done) ? done : nchars-done;
5910 Py_UNICODE_COPY(p+done, p, n);
5911 done += n;
5912 }
5913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
5915 return (PyObject*) u;
5916}
5917
5918PyObject *PyUnicode_Replace(PyObject *obj,
5919 PyObject *subobj,
5920 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
5923 PyObject *self;
5924 PyObject *str1;
5925 PyObject *str2;
5926 PyObject *result;
5927
5928 self = PyUnicode_FromObject(obj);
5929 if (self == NULL)
5930 return NULL;
5931 str1 = PyUnicode_FromObject(subobj);
5932 if (str1 == NULL) {
5933 Py_DECREF(self);
5934 return NULL;
5935 }
5936 str2 = PyUnicode_FromObject(replobj);
5937 if (str2 == NULL) {
5938 Py_DECREF(self);
5939 Py_DECREF(str1);
5940 return NULL;
5941 }
Tim Petersced69f82003-09-16 20:30:58 +00005942 result = replace((PyUnicodeObject *)self,
5943 (PyUnicodeObject *)str1,
5944 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 maxcount);
5946 Py_DECREF(self);
5947 Py_DECREF(str1);
5948 Py_DECREF(str2);
5949 return result;
5950}
5951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953"S.replace (old, new[, maxsplit]) -> unicode\n\
5954\n\
5955Return a copy of S with all occurrences of substring\n\
5956old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005957given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
5959static PyObject*
5960unicode_replace(PyUnicodeObject *self, PyObject *args)
5961{
5962 PyUnicodeObject *str1;
5963 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005964 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 PyObject *result;
5966
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return NULL;
5969 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5970 if (str1 == NULL)
5971 return NULL;
5972 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005973 if (str2 == NULL) {
5974 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
5978 result = replace(self, str1, str2, maxcount);
5979
5980 Py_DECREF(str1);
5981 Py_DECREF(str2);
5982 return result;
5983}
5984
5985static
5986PyObject *unicode_repr(PyObject *unicode)
5987{
5988 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5989 PyUnicode_GET_SIZE(unicode),
5990 1);
5991}
5992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005993PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994"S.rfind(sub [,start [,end]]) -> int\n\
5995\n\
5996Return the highest index in S where substring sub is found,\n\
5997such that sub is contained within s[start,end]. Optional\n\
5998arguments start and end are interpreted as in slice notation.\n\
5999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006000Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
6002static PyObject *
6003unicode_rfind(PyUnicodeObject *self, PyObject *args)
6004{
6005 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006006 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006007 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 PyObject *result;
6009
Guido van Rossumb8872e62000-05-09 14:14:27 +00006010 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return NULL;
6013 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6014 (PyObject *)substring);
6015 if (substring == NULL)
6016 return NULL;
6017
Martin v. Löwis18e16552006-02-15 17:27:45 +00006018 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020 Py_DECREF(substring);
6021 return result;
6022}
6023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025"S.rindex(sub [,start [,end]]) -> int\n\
6026\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006027Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029static PyObject *
6030unicode_rindex(PyUnicodeObject *self, PyObject *args)
6031{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006035 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
Guido van Rossumb8872e62000-05-09 14:14:27 +00006037 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6038 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return NULL;
6040 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6041 (PyObject *)substring);
6042 if (substring == NULL)
6043 return NULL;
6044
6045 result = findstring(self, substring, start, end, -1);
6046
6047 Py_DECREF(substring);
6048 if (result < 0) {
6049 PyErr_SetString(PyExc_ValueError, "substring not found");
6050 return NULL;
6051 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006052 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053}
6054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057\n\
6058Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006059done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
6061static PyObject *
6062unicode_rjust(PyUnicodeObject *self, PyObject *args)
6063{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006064 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006065 Py_UNICODE fillchar = ' ';
6066
Martin v. Löwis412fb672006-04-13 06:34:32 +00006067 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 return NULL;
6069
Tim Peters7a29bd52001-09-12 03:03:31 +00006070 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 Py_INCREF(self);
6072 return (PyObject*) self;
6073 }
6074
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006075 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076}
6077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006079unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080{
6081 /* standard clamping */
6082 if (start < 0)
6083 start = 0;
6084 if (end < 0)
6085 end = 0;
6086 if (end > self->length)
6087 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006088 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 /* full slice, return original string */
6090 Py_INCREF(self);
6091 return (PyObject*) self;
6092 }
6093 if (start > end)
6094 start = end;
6095 /* copy slice */
6096 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6097 end - start);
6098}
6099
6100PyObject *PyUnicode_Split(PyObject *s,
6101 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006105
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 s = PyUnicode_FromObject(s);
6107 if (s == NULL)
6108 return NULL;
6109 if (sep != NULL) {
6110 sep = PyUnicode_FromObject(sep);
6111 if (sep == NULL) {
6112 Py_DECREF(s);
6113 return NULL;
6114 }
6115 }
6116
6117 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6118
6119 Py_DECREF(s);
6120 Py_XDECREF(sep);
6121 return result;
6122}
6123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125"S.split([sep [,maxsplit]]) -> list of strings\n\
6126\n\
6127Return a list of the words in S, using sep as the\n\
6128delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006129splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006130any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
6132static PyObject*
6133unicode_split(PyUnicodeObject *self, PyObject *args)
6134{
6135 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006136 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Martin v. Löwis18e16552006-02-15 17:27:45 +00006138 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 return NULL;
6140
6141 if (substring == Py_None)
6142 return split(self, NULL, maxcount);
6143 else if (PyUnicode_Check(substring))
6144 return split(self, (PyUnicodeObject *)substring, maxcount);
6145 else
6146 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6147}
6148
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006149PyObject *PyUnicode_RSplit(PyObject *s,
6150 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006152{
6153 PyObject *result;
6154
6155 s = PyUnicode_FromObject(s);
6156 if (s == NULL)
6157 return NULL;
6158 if (sep != NULL) {
6159 sep = PyUnicode_FromObject(sep);
6160 if (sep == NULL) {
6161 Py_DECREF(s);
6162 return NULL;
6163 }
6164 }
6165
6166 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6167
6168 Py_DECREF(s);
6169 Py_XDECREF(sep);
6170 return result;
6171}
6172
6173PyDoc_STRVAR(rsplit__doc__,
6174"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6175\n\
6176Return a list of the words in S, using sep as the\n\
6177delimiter string, starting at the end of the string and\n\
6178working to the front. If maxsplit is given, at most maxsplit\n\
6179splits are done. If sep is not specified, any whitespace string\n\
6180is a separator.");
6181
6182static PyObject*
6183unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6184{
6185 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006187
Martin v. Löwis18e16552006-02-15 17:27:45 +00006188 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006189 return NULL;
6190
6191 if (substring == Py_None)
6192 return rsplit(self, NULL, maxcount);
6193 else if (PyUnicode_Check(substring))
6194 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6195 else
6196 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6197}
6198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006199PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006200"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201\n\
6202Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006203Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205
6206static PyObject*
6207unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6208{
Guido van Rossum86662912000-04-11 15:38:46 +00006209 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
Guido van Rossum86662912000-04-11 15:38:46 +00006211 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return NULL;
6213
Guido van Rossum86662912000-04-11 15:38:46 +00006214 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215}
6216
6217static
6218PyObject *unicode_str(PyUnicodeObject *self)
6219{
Fred Drakee4315f52000-05-09 19:53:39 +00006220 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224"S.swapcase() -> unicode\n\
6225\n\
6226Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006227and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
6229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006230unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 return fixup(self, fixswapcase);
6233}
6234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006235PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236"S.translate(table) -> unicode\n\
6237\n\
6238Return a copy of the string S, where all characters have been mapped\n\
6239through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006240Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6241Unmapped characters are left untouched. Characters mapped to None\n\
6242are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243
6244static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006245unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
Tim Petersced69f82003-09-16 20:30:58 +00006247 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006249 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 "ignore");
6251}
6252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006253PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254"S.upper() -> unicode\n\
6255\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006256Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257
6258static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006259unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 return fixup(self, fixupper);
6262}
6263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006264PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265"S.zfill(width) -> unicode\n\
6266\n\
6267Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006268of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
6270static PyObject *
6271unicode_zfill(PyUnicodeObject *self, PyObject *args)
6272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 PyUnicodeObject *u;
6275
Martin v. Löwis18e16552006-02-15 17:27:45 +00006276 Py_ssize_t width;
6277 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 return NULL;
6279
6280 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006281 if (PyUnicode_CheckExact(self)) {
6282 Py_INCREF(self);
6283 return (PyObject*) self;
6284 }
6285 else
6286 return PyUnicode_FromUnicode(
6287 PyUnicode_AS_UNICODE(self),
6288 PyUnicode_GET_SIZE(self)
6289 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 }
6291
6292 fill = width - self->length;
6293
6294 u = pad(self, fill, 0, '0');
6295
Walter Dörwald068325e2002-04-15 13:36:47 +00006296 if (u == NULL)
6297 return NULL;
6298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 if (u->str[fill] == '+' || u->str[fill] == '-') {
6300 /* move sign to beginning of string */
6301 u->str[0] = u->str[fill];
6302 u->str[fill] = '0';
6303 }
6304
6305 return (PyObject*) u;
6306}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
#if 0
/* Disabled debugging helper: report unicode_freelist_size as an int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006316PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006317"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006319Return True if S starts with the specified prefix, False otherwise.\n\
6320With optional start, test S beginning at that position.\n\
6321With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
6323static PyObject *
6324unicode_startswith(PyUnicodeObject *self,
6325 PyObject *args)
6326{
6327 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006328 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006329 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 PyObject *result;
6331
Guido van Rossumb8872e62000-05-09 14:14:27 +00006332 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6333 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 return NULL;
6335 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6336 (PyObject *)substring);
6337 if (substring == NULL)
6338 return NULL;
6339
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341
6342 Py_DECREF(substring);
6343 return result;
6344}
6345
6346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006347PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006348"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006350Return True if S ends with the specified suffix, False otherwise.\n\
6351With optional start, test S beginning at that position.\n\
6352With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354static PyObject *
6355unicode_endswith(PyUnicodeObject *self,
6356 PyObject *args)
6357{
6358 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006360 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 PyObject *result;
6362
Guido van Rossumb8872e62000-05-09 14:14:27 +00006363 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6364 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 return NULL;
6366 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6367 (PyObject *)substring);
6368 if (substring == NULL)
6369 return NULL;
6370
Guido van Rossum77f6a652002-04-03 22:41:51 +00006371 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
6373 Py_DECREF(substring);
6374 return result;
6375}
6376
6377
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006378
6379static PyObject *
6380unicode_getnewargs(PyUnicodeObject *v)
6381{
6382 return Py_BuildValue("(u#)", v->str, v->length);
6383}
6384
6385
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386static PyMethodDef unicode_methods[] = {
6387
6388 /* Order is according to common usage: often used methods should
6389 appear first, since lookup is done sequentially. */
6390
Georg Brandlecdc0a92006-03-30 12:19:07 +00006391 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6393 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006394 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006395 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6396 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6397 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6398 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6399 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6400 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6401 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6402 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6403 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6404 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006405 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006407/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6408 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6409 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6410 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006411 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006412 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006413 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006414 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6415 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6416 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6417 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6418 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6419 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6420 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6421 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6422 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6423 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6424 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6425 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6426 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6427 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006428 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006429#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006430 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431#endif
6432
6433#if 0
6434 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006435 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436#endif
6437
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006438 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 {NULL, NULL}
6440};
6441
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006442static PyObject *
6443unicode_mod(PyObject *v, PyObject *w)
6444{
6445 if (!PyUnicode_Check(v)) {
6446 Py_INCREF(Py_NotImplemented);
6447 return Py_NotImplemented;
6448 }
6449 return PyUnicode_Format(v, w);
6450}
6451
6452static PyNumberMethods unicode_as_number = {
6453 0, /*nb_add*/
6454 0, /*nb_subtract*/
6455 0, /*nb_multiply*/
6456 0, /*nb_divide*/
6457 unicode_mod, /*nb_remainder*/
6458};
6459
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006462 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006463 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6464 (ssizeargfunc) unicode_getitem, /* sq_item */
6465 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 0, /* sq_ass_item */
6467 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006468 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469};
6470
/* True when the type of object o has the Py_TPFLAGS_HAVE_INDEX flag. */
#define HASINDEX(o) \
    PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6472
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006473static PyObject*
6474unicode_subscript(PyUnicodeObject* self, PyObject* item)
6475{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006476 PyNumberMethods *nb = item->ob_type->tp_as_number;
6477 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6478 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006479 if (i == -1 && PyErr_Occurred())
6480 return NULL;
6481 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006482 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006483 return unicode_getitem(self, i);
6484 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006485 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006486 Py_UNICODE* source_buf;
6487 Py_UNICODE* result_buf;
6488 PyObject* result;
6489
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006490 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006491 &start, &stop, &step, &slicelength) < 0) {
6492 return NULL;
6493 }
6494
6495 if (slicelength <= 0) {
6496 return PyUnicode_FromUnicode(NULL, 0);
6497 } else {
6498 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006499 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6500 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006501
6502 if (result_buf == NULL)
6503 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006504
6505 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6506 result_buf[i] = source_buf[cur];
6507 }
Tim Petersced69f82003-09-16 20:30:58 +00006508
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006509 result = PyUnicode_FromUnicode(result_buf, slicelength);
6510 PyMem_FREE(result_buf);
6511 return result;
6512 }
6513 } else {
6514 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6515 return NULL;
6516 }
6517}
6518
6519static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006520 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006521 (binaryfunc)unicode_subscript, /* mp_subscript */
6522 (objobjargproc)0, /* mp_ass_subscript */
6523};
6524
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 const void **ptr)
6529{
6530 if (index != 0) {
6531 PyErr_SetString(PyExc_SystemError,
6532 "accessing non-existent unicode segment");
6533 return -1;
6534 }
6535 *ptr = (void *) self->str;
6536 return PyUnicode_GET_DATA_SIZE(self);
6537}
6538
Martin v. Löwis18e16552006-02-15 17:27:45 +00006539static Py_ssize_t
6540unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 const void **ptr)
6542{
6543 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006544 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 return -1;
6546}
6547
6548static int
6549unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551{
6552 if (lenp)
6553 *lenp = PyUnicode_GET_DATA_SIZE(self);
6554 return 1;
6555}
6556
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006557static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006559 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 const void **ptr)
6561{
6562 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 if (index != 0) {
6565 PyErr_SetString(PyExc_SystemError,
6566 "accessing non-existent unicode segment");
6567 return -1;
6568 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006569 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 if (str == NULL)
6571 return -1;
6572 *ptr = (void *) PyString_AS_STRING(str);
6573 return PyString_GET_SIZE(str);
6574}
6575
6576/* Helpers for PyUnicode_Format() */
6577
6578static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 if (argidx < arglen) {
6583 (*p_argidx)++;
6584 if (arglen < 0)
6585 return args;
6586 else
6587 return PyTuple_GetItem(args, argidx);
6588 }
6589 PyErr_SetString(PyExc_TypeError,
6590 "not enough arguments for format string");
6591 return NULL;
6592}
6593
/* Formatting flag bits; each occupies its own bit so they can be OR-ed
   together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6599
Martin v. Löwis18e16552006-02-15 17:27:45 +00006600static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006601strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 register Py_ssize_t i;
6604 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 for (i = len - 1; i >= 0; i--)
6606 buffer[i] = (Py_UNICODE) charbuffer[i];
6607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 return len;
6609}
6610
Neal Norwitzfc76d632006-01-10 06:03:13 +00006611static int
6612doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6613{
Tim Peters15231542006-02-16 01:08:01 +00006614 Py_ssize_t result;
6615
Neal Norwitzfc76d632006-01-10 06:03:13 +00006616 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006617 result = strtounicode(buffer, (char *)buffer);
6618 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006619}
6620
6621static int
6622longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6623{
Tim Peters15231542006-02-16 01:08:01 +00006624 Py_ssize_t result;
6625
Neal Norwitzfc76d632006-01-10 06:03:13 +00006626 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006627 result = strtounicode(buffer, (char *)buffer);
6628 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006629}
6630
Guido van Rossum078151d2002-08-11 04:24:12 +00006631/* XXX To save some code duplication, formatfloat/long/int could have been
6632 shared with stringobject.c, converting from 8-bit to Unicode after the
6633 formatting is done. */
6634
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635static int
6636formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006637 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 int flags,
6639 int prec,
6640 int type,
6641 PyObject *v)
6642{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006643 /* fmt = '%#.' + `prec` + `type`
6644 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 char fmt[20];
6646 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006647
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 x = PyFloat_AsDouble(v);
6649 if (x == -1.0 && PyErr_Occurred())
6650 return -1;
6651 if (prec < 0)
6652 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6654 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006655 /* Worst case length calc to ensure no buffer overrun:
6656
6657 'g' formats:
6658 fmt = %#.<prec>g
6659 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6660 for any double rep.)
6661 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6662
6663 'f' formats:
6664 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6665 len = 1 + 50 + 1 + prec = 52 + prec
6666
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006667 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006668 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006669
6670 */
6671 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6672 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006673 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006674 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006675 return -1;
6676 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006677 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6678 (flags&F_ALT) ? "#" : "",
6679 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006680 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
Tim Peters38fd5b62000-09-21 05:43:11 +00006683static PyObject*
6684formatlong(PyObject *val, int flags, int prec, int type)
6685{
6686 char *buf;
6687 int i, len;
6688 PyObject *str; /* temporary string object. */
6689 PyUnicodeObject *result;
6690
6691 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6692 if (!str)
6693 return NULL;
6694 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006695 if (!result) {
6696 Py_DECREF(str);
6697 return NULL;
6698 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006699 for (i = 0; i < len; i++)
6700 result->str[i] = buf[i];
6701 result->str[len] = 0;
6702 Py_DECREF(str);
6703 return (PyObject*)result;
6704}
6705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706static int
6707formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006708 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 int flags,
6710 int prec,
6711 int type,
6712 PyObject *v)
6713{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006714 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006715 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6716 * + 1 + 1
6717 * = 24
6718 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006719 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006720 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 long x;
6722
6723 x = PyInt_AsLong(v);
6724 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006725 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006726 if (x < 0 && type == 'u') {
6727 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006728 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006729 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6730 sign = "-";
6731 else
6732 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006734 prec = 1;
6735
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006736 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6737 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006738 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006739 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006740 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006741 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006742 return -1;
6743 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006744
6745 if ((flags & F_ALT) &&
6746 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006747 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006748 * of issues that cause pain:
6749 * - when 0 is being converted, the C standard leaves off
6750 * the '0x' or '0X', which is inconsistent with other
6751 * %#x/%#X conversions and inconsistent with Python's
6752 * hex() function
6753 * - there are platforms that violate the standard and
6754 * convert 0 with the '0x' or '0X'
6755 * (Metrowerks, Compaq Tru64)
6756 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006757 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006758 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006759 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006760 * We can achieve the desired consistency by inserting our
6761 * own '0x' or '0X' prefix, and substituting %x/%X in place
6762 * of %#x/%#X.
6763 *
6764 * Note that this is the same approach as used in
6765 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006766 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006767 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6768 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006769 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006770 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006771 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6772 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006773 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006774 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006775 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006776 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006777 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006778 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779}
6780
6781static int
6782formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006783 size_t buflen,
6784 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006786 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006787 if (PyUnicode_Check(v)) {
6788 if (PyUnicode_GET_SIZE(v) != 1)
6789 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006793 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006794 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006795 goto onError;
6796 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
6799 else {
6800 /* Integer input truncated to a character */
6801 long x;
6802 x = PyInt_AsLong(v);
6803 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006804 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006805#ifdef Py_UNICODE_WIDE
6806 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006807 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006808 "%c arg not in range(0x110000) "
6809 "(wide Python build)");
6810 return -1;
6811 }
6812#else
6813 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006814 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006815 "%c arg not in range(0x10000) "
6816 "(narrow Python build)");
6817 return -1;
6818 }
6819#endif
6820 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
6822 buf[1] = '\0';
6823 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006824
6825 onError:
6826 PyErr_SetString(PyExc_TypeError,
6827 "%c requires int or char");
6828 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
6830
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints and chars are formatted.  XXX This is a
   magic number.  The formatting routines are handed the buffer length
   so they can guard against overflow, but a better solution may be to
   malloc a buffer of appropriate size for each format.  For now, the
   current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
6840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841PyObject *PyUnicode_Format(PyObject *format,
6842 PyObject *args)
6843{
6844 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006845 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 int args_owned = 0;
6847 PyUnicodeObject *result = NULL;
6848 PyObject *dict = NULL;
6849 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006850
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 if (format == NULL || args == NULL) {
6852 PyErr_BadInternalCall();
6853 return NULL;
6854 }
6855 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006856 if (uformat == NULL)
6857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 fmt = PyUnicode_AS_UNICODE(uformat);
6859 fmtcnt = PyUnicode_GET_SIZE(uformat);
6860
6861 reslen = rescnt = fmtcnt + 100;
6862 result = _PyUnicode_New(reslen);
6863 if (result == NULL)
6864 goto onError;
6865 res = PyUnicode_AS_UNICODE(result);
6866
6867 if (PyTuple_Check(args)) {
6868 arglen = PyTuple_Size(args);
6869 argidx = 0;
6870 }
6871 else {
6872 arglen = -1;
6873 argidx = -2;
6874 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006875 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6876 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 dict = args;
6878
6879 while (--fmtcnt >= 0) {
6880 if (*fmt != '%') {
6881 if (--rescnt < 0) {
6882 rescnt = fmtcnt + 100;
6883 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006884 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006885 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6887 --rescnt;
6888 }
6889 *res++ = *fmt++;
6890 }
6891 else {
6892 /* Got a format specifier */
6893 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006894 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 Py_UNICODE c = '\0';
6897 Py_UNICODE fill;
6898 PyObject *v = NULL;
6899 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006900 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006903 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
6905 fmt++;
6906 if (*fmt == '(') {
6907 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006908 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 PyObject *key;
6910 int pcount = 1;
6911
6912 if (dict == NULL) {
6913 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006914 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 goto onError;
6916 }
6917 ++fmt;
6918 --fmtcnt;
6919 keystart = fmt;
6920 /* Skip over balanced parentheses */
6921 while (pcount > 0 && --fmtcnt >= 0) {
6922 if (*fmt == ')')
6923 --pcount;
6924 else if (*fmt == '(')
6925 ++pcount;
6926 fmt++;
6927 }
6928 keylen = fmt - keystart - 1;
6929 if (fmtcnt < 0 || pcount > 0) {
6930 PyErr_SetString(PyExc_ValueError,
6931 "incomplete format key");
6932 goto onError;
6933 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006934#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006935 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 then looked up since Python uses strings to hold
6937 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006938 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 key = PyUnicode_EncodeUTF8(keystart,
6940 keylen,
6941 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006942#else
6943 key = PyUnicode_FromUnicode(keystart, keylen);
6944#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (key == NULL)
6946 goto onError;
6947 if (args_owned) {
6948 Py_DECREF(args);
6949 args_owned = 0;
6950 }
6951 args = PyObject_GetItem(dict, key);
6952 Py_DECREF(key);
6953 if (args == NULL) {
6954 goto onError;
6955 }
6956 args_owned = 1;
6957 arglen = -1;
6958 argidx = -2;
6959 }
6960 while (--fmtcnt >= 0) {
6961 switch (c = *fmt++) {
6962 case '-': flags |= F_LJUST; continue;
6963 case '+': flags |= F_SIGN; continue;
6964 case ' ': flags |= F_BLANK; continue;
6965 case '#': flags |= F_ALT; continue;
6966 case '0': flags |= F_ZERO; continue;
6967 }
6968 break;
6969 }
6970 if (c == '*') {
6971 v = getnextarg(args, arglen, &argidx);
6972 if (v == NULL)
6973 goto onError;
6974 if (!PyInt_Check(v)) {
6975 PyErr_SetString(PyExc_TypeError,
6976 "* wants int");
6977 goto onError;
6978 }
6979 width = PyInt_AsLong(v);
6980 if (width < 0) {
6981 flags |= F_LJUST;
6982 width = -width;
6983 }
6984 if (--fmtcnt >= 0)
6985 c = *fmt++;
6986 }
6987 else if (c >= '0' && c <= '9') {
6988 width = c - '0';
6989 while (--fmtcnt >= 0) {
6990 c = *fmt++;
6991 if (c < '0' || c > '9')
6992 break;
6993 if ((width*10) / 10 != width) {
6994 PyErr_SetString(PyExc_ValueError,
6995 "width too big");
6996 goto onError;
6997 }
6998 width = width*10 + (c - '0');
6999 }
7000 }
7001 if (c == '.') {
7002 prec = 0;
7003 if (--fmtcnt >= 0)
7004 c = *fmt++;
7005 if (c == '*') {
7006 v = getnextarg(args, arglen, &argidx);
7007 if (v == NULL)
7008 goto onError;
7009 if (!PyInt_Check(v)) {
7010 PyErr_SetString(PyExc_TypeError,
7011 "* wants int");
7012 goto onError;
7013 }
7014 prec = PyInt_AsLong(v);
7015 if (prec < 0)
7016 prec = 0;
7017 if (--fmtcnt >= 0)
7018 c = *fmt++;
7019 }
7020 else if (c >= '0' && c <= '9') {
7021 prec = c - '0';
7022 while (--fmtcnt >= 0) {
7023 c = Py_CHARMASK(*fmt++);
7024 if (c < '0' || c > '9')
7025 break;
7026 if ((prec*10) / 10 != prec) {
7027 PyErr_SetString(PyExc_ValueError,
7028 "prec too big");
7029 goto onError;
7030 }
7031 prec = prec*10 + (c - '0');
7032 }
7033 }
7034 } /* prec */
7035 if (fmtcnt >= 0) {
7036 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 if (--fmtcnt >= 0)
7038 c = *fmt++;
7039 }
7040 }
7041 if (fmtcnt < 0) {
7042 PyErr_SetString(PyExc_ValueError,
7043 "incomplete format");
7044 goto onError;
7045 }
7046 if (c != '%') {
7047 v = getnextarg(args, arglen, &argidx);
7048 if (v == NULL)
7049 goto onError;
7050 }
7051 sign = 0;
7052 fill = ' ';
7053 switch (c) {
7054
7055 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007056 pbuf = formatbuf;
7057 /* presume that buffer length is at least 1 */
7058 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 len = 1;
7060 break;
7061
7062 case 's':
7063 case 'r':
7064 if (PyUnicode_Check(v) && c == 's') {
7065 temp = v;
7066 Py_INCREF(temp);
7067 }
7068 else {
7069 PyObject *unicode;
7070 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007071 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 else
7073 temp = PyObject_Repr(v);
7074 if (temp == NULL)
7075 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007076 if (PyUnicode_Check(temp))
7077 /* nothing to do */;
7078 else if (PyString_Check(temp)) {
7079 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007080 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007082 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007084 Py_DECREF(temp);
7085 temp = unicode;
7086 if (temp == NULL)
7087 goto onError;
7088 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007089 else {
7090 Py_DECREF(temp);
7091 PyErr_SetString(PyExc_TypeError,
7092 "%s argument has non-string str()");
7093 goto onError;
7094 }
7095 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007096 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 len = PyUnicode_GET_SIZE(temp);
7098 if (prec >= 0 && len > prec)
7099 len = prec;
7100 break;
7101
7102 case 'i':
7103 case 'd':
7104 case 'u':
7105 case 'o':
7106 case 'x':
7107 case 'X':
7108 if (c == 'i')
7109 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007110 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007111 temp = formatlong(v, flags, prec, c);
7112 if (!temp)
7113 goto onError;
7114 pbuf = PyUnicode_AS_UNICODE(temp);
7115 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007116 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007118 else {
7119 pbuf = formatbuf;
7120 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7121 flags, prec, c, v);
7122 if (len < 0)
7123 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007124 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007125 }
7126 if (flags & F_ZERO)
7127 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 break;
7129
7130 case 'e':
7131 case 'E':
7132 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007133 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 case 'g':
7135 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007136 if (c == 'F')
7137 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007138 pbuf = formatbuf;
7139 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7140 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 if (len < 0)
7142 goto onError;
7143 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007144 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 fill = '0';
7146 break;
7147
7148 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007149 pbuf = formatbuf;
7150 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 if (len < 0)
7152 goto onError;
7153 break;
7154
7155 default:
7156 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007157 "unsupported format character '%c' (0x%x) "
7158 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007159 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007160 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007161 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 goto onError;
7163 }
7164 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007165 if (*pbuf == '-' || *pbuf == '+') {
7166 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 len--;
7168 }
7169 else if (flags & F_SIGN)
7170 sign = '+';
7171 else if (flags & F_BLANK)
7172 sign = ' ';
7173 else
7174 sign = 0;
7175 }
7176 if (width < len)
7177 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007178 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 reslen -= rescnt;
7180 rescnt = width + fmtcnt + 100;
7181 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007182 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007183 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007184 PyErr_NoMemory();
7185 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007186 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007187 if (_PyUnicode_Resize(&result, reslen) < 0) {
7188 Py_XDECREF(temp);
7189 goto onError;
7190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 res = PyUnicode_AS_UNICODE(result)
7192 + reslen - rescnt;
7193 }
7194 if (sign) {
7195 if (fill != ' ')
7196 *res++ = sign;
7197 rescnt--;
7198 if (width > len)
7199 width--;
7200 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007201 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7202 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007203 assert(pbuf[1] == c);
7204 if (fill != ' ') {
7205 *res++ = *pbuf++;
7206 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007207 }
Tim Petersfff53252001-04-12 18:38:48 +00007208 rescnt -= 2;
7209 width -= 2;
7210 if (width < 0)
7211 width = 0;
7212 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (width > len && !(flags & F_LJUST)) {
7215 do {
7216 --rescnt;
7217 *res++ = fill;
7218 } while (--width > len);
7219 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007220 if (fill == ' ') {
7221 if (sign)
7222 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007223 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007224 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007225 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007226 *res++ = *pbuf++;
7227 *res++ = *pbuf++;
7228 }
7229 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007230 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 res += len;
7232 rescnt -= len;
7233 while (--width >= len) {
7234 --rescnt;
7235 *res++ = ' ';
7236 }
7237 if (dict && (argidx < arglen) && c != '%') {
7238 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007239 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007240 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 goto onError;
7242 }
7243 Py_XDECREF(temp);
7244 } /* '%' */
7245 } /* until end */
7246 if (argidx < arglen && !dict) {
7247 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007248 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 goto onError;
7250 }
7251
Thomas Woutersa96affe2006-03-12 00:29:36 +00007252 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 if (args_owned) {
7255 Py_DECREF(args);
7256 }
7257 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 return (PyObject *)result;
7259
7260 onError:
7261 Py_XDECREF(result);
7262 Py_DECREF(uformat);
7263 if (args_owned) {
7264 Py_DECREF(args);
7265 }
7266 return NULL;
7267}
7268
7269static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 (readbufferproc) unicode_buffer_getreadbuf,
7271 (writebufferproc) unicode_buffer_getwritebuf,
7272 (segcountproc) unicode_buffer_getsegcount,
7273 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274};
7275
Jeremy Hylton938ace62002-07-17 16:30:39 +00007276static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007277unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7278
Tim Peters6d6c1a32001-08-02 04:15:00 +00007279static PyObject *
7280unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7281{
7282 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007283 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007284 char *encoding = NULL;
7285 char *errors = NULL;
7286
Guido van Rossume023fe02001-08-30 03:12:59 +00007287 if (type != &PyUnicode_Type)
7288 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007289 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7290 kwlist, &x, &encoding, &errors))
7291 return NULL;
7292 if (x == NULL)
7293 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007294 if (encoding == NULL && errors == NULL)
7295 return PyObject_Unicode(x);
7296 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007297 return PyUnicode_FromEncodedObject(x, encoding, errors);
7298}
7299
Guido van Rossume023fe02001-08-30 03:12:59 +00007300static PyObject *
7301unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7302{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007303 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007305
7306 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7307 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7308 if (tmp == NULL)
7309 return NULL;
7310 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007311 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007312 if (pnew == NULL) {
7313 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007314 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007315 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007316 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7317 if (pnew->str == NULL) {
7318 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007319 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007320 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007321 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007322 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007323 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7324 pnew->length = n;
7325 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007326 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007327 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007328}
7329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007330PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007331"unicode(string [, encoding[, errors]]) -> object\n\
7332\n\
7333Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007334encoding defaults to the current default string encoding.\n\
7335errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007336
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337PyTypeObject PyUnicode_Type = {
7338 PyObject_HEAD_INIT(&PyType_Type)
7339 0, /* ob_size */
7340 "unicode", /* tp_name */
7341 sizeof(PyUnicodeObject), /* tp_size */
7342 0, /* tp_itemsize */
7343 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007344 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007346 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 0, /* tp_setattr */
7348 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007349 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007350 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007352 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 (hashfunc) unicode_hash, /* tp_hash*/
7354 0, /* tp_call*/
7355 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007356 PyObject_GenericGetAttr, /* tp_getattro */
7357 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007359 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7360 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007361 unicode_doc, /* tp_doc */
7362 0, /* tp_traverse */
7363 0, /* tp_clear */
7364 0, /* tp_richcompare */
7365 0, /* tp_weaklistoffset */
7366 0, /* tp_iter */
7367 0, /* tp_iternext */
7368 unicode_methods, /* tp_methods */
7369 0, /* tp_members */
7370 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007371 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007372 0, /* tp_dict */
7373 0, /* tp_descr_get */
7374 0, /* tp_descr_set */
7375 0, /* tp_dictoffset */
7376 0, /* tp_init */
7377 0, /* tp_alloc */
7378 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007379 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380};
7381
7382/* Initialize the Unicode implementation */
7383
Thomas Wouters78890102000-07-22 19:25:51 +00007384void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007386 int i;
7387
Fred Drakee4315f52000-05-09 19:53:39 +00007388 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007389 unicode_freelist = NULL;
7390 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007392 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007393 for (i = 0; i < 256; i++)
7394 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007395 if (PyType_Ready(&PyUnicode_Type) < 0)
7396 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397}
7398
7399/* Finalize the Unicode implementation */
7400
7401void
Thomas Wouters78890102000-07-22 19:25:51 +00007402_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007404 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007405 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007407 Py_XDECREF(unicode_empty);
7408 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007409
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007410 for (i = 0; i < 256; i++) {
7411 if (unicode_latin1[i]) {
7412 Py_DECREF(unicode_latin1[i]);
7413 unicode_latin1[i] = NULL;
7414 }
7415 }
7416
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007417 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 PyUnicodeObject *v = u;
7419 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007420 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007421 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007422 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007423 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007425 unicode_freelist = NULL;
7426 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007428
Anthony Baxterac6bd462006-04-13 02:06:09 +00007429#ifdef __cplusplus
7430}
7431#endif
7432
7433
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007434/*
7435Local variables:
7436c-basic-offset: 4
7437indent-tabs-mode: nil
7438End:
7439*/