blob: 26d6fc64eeef689c82206f99eeefbadc2659db91 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for the Unicode object free list.

   An object put on the free list keeps its character buffer allocated
   as long as its length is below this limit; buffers of this length or
   more are released.  This reduces malloc() overhead for small Unicode
   objects.

   At worst this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple a single bitmask is used; the bit index is taken from
   the least significant 5 bits of each Unicode character. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by ch is set in mask.  The shifted
   constant is an unsigned long: shifting a plain int 1 left by 31 is
   undefined behaviour and, once widened, could also match bits above
   bit 31 of the mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test: cheap bloom pre-check first, exact scan of the set
   only when the bloom bit is present.  The expansion is parenthesized
   as a whole so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Convenience wrapper for use inside unicodeobject.c only: casts the
   address of a Unicode object variable to the PyObject ** expected by
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
396#ifdef HAVE_WCHAR_H
397
398PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000399 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
401 PyUnicodeObject *unicode;
402
403 if (w == NULL) {
404 PyErr_BadInternalCall();
405 return NULL;
406 }
407
408 unicode = _PyUnicode_New(size);
409 if (!unicode)
410 return NULL;
411
412 /* Copy the wchar_t data into the new object */
413#ifdef HAVE_USABLE_WCHAR_T
414 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000415#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 {
417 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000418 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000420 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421 *u++ = *w++;
422 }
423#endif
424
425 return (PyObject *)unicode;
426}
427
Martin v. Löwis18e16552006-02-15 17:27:45 +0000428Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
429 wchar_t *w,
430 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431{
432 if (unicode == NULL) {
433 PyErr_BadInternalCall();
434 return -1;
435 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000436
437 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000438 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000439 size = PyUnicode_GET_SIZE(unicode) + 1;
440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441#ifdef HAVE_USABLE_WCHAR_T
442 memcpy(w, unicode->str, size * sizeof(wchar_t));
443#else
444 {
445 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000446 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000448 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449 *w++ = *u++;
450 }
451#endif
452
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000453 if (size > PyUnicode_GET_SIZE(unicode))
454 return PyUnicode_GET_SIZE(unicode);
455 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 return size;
457}
458
459#endif
460
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000461PyObject *PyUnicode_FromOrdinal(int ordinal)
462{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000463 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000464
465#ifdef Py_UNICODE_WIDE
466 if (ordinal < 0 || ordinal > 0x10ffff) {
467 PyErr_SetString(PyExc_ValueError,
468 "unichr() arg not in range(0x110000) "
469 "(wide Python build)");
470 return NULL;
471 }
472#else
473 if (ordinal < 0 || ordinal > 0xffff) {
474 PyErr_SetString(PyExc_ValueError,
475 "unichr() arg not in range(0x10000) "
476 "(narrow Python build)");
477 return NULL;
478 }
479#endif
480
Hye-Shik Chang40574832004-04-06 07:24:51 +0000481 s[0] = (Py_UNICODE)ordinal;
482 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromObject(register PyObject *obj)
486{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487 /* XXX Perhaps we should make this API an alias of
488 PyObject_Unicode() instead ?! */
489 if (PyUnicode_CheckExact(obj)) {
490 Py_INCREF(obj);
491 return obj;
492 }
493 if (PyUnicode_Check(obj)) {
494 /* For a Unicode subtype that's not a Unicode object,
495 return a true Unicode object with the same data. */
496 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
497 PyUnicode_GET_SIZE(obj));
498 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000499 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
500}
501
502PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
503 const char *encoding,
504 const char *errors)
505{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000507 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000509
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (obj == NULL) {
511 PyErr_BadInternalCall();
512 return NULL;
513 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000515#if 0
516 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000517 that no encodings is given and then redirect to
518 PyObject_Unicode() which then applies the additional logic for
519 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000521 NOTE: This API should really only be used for object which
522 represent *encoded* Unicode !
523
524 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000525 if (PyUnicode_Check(obj)) {
526 if (encoding) {
527 PyErr_SetString(PyExc_TypeError,
528 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000530 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000532 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533#else
534 if (PyUnicode_Check(obj)) {
535 PyErr_SetString(PyExc_TypeError,
536 "decoding Unicode is not supported");
537 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000538 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539#endif
540
541 /* Coerce object */
542 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000543 s = PyString_AS_STRING(obj);
544 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000545 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000546 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
547 /* Overwrite the error message with something more useful in
548 case of a TypeError. */
549 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000550 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000551 "coercing to Unicode: need string or buffer, "
552 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000553 obj->ob_type->tp_name);
554 goto onError;
555 }
Tim Petersced69f82003-09-16 20:30:58 +0000556
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000557 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (len == 0) {
559 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000560 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 }
Tim Petersced69f82003-09-16 20:30:58 +0000562 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000563 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000564
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000565 return v;
566
567 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569}
570
571PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000572 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 const char *encoding,
574 const char *errors)
575{
576 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000577
578 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Shortcuts for common default encodings */
582 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000583 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000584 else if (strcmp(encoding, "latin-1") == 0)
585 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000586#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
587 else if (strcmp(encoding, "mbcs") == 0)
588 return PyUnicode_DecodeMBCS(s, size, errors);
589#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000590 else if (strcmp(encoding, "ascii") == 0)
591 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592
593 /* Decode via the codec registry */
594 buffer = PyBuffer_FromMemory((void *)s, size);
595 if (buffer == NULL)
596 goto onError;
597 unicode = PyCodec_Decode(buffer, encoding, errors);
598 if (unicode == NULL)
599 goto onError;
600 if (!PyUnicode_Check(unicode)) {
601 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000602 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603 unicode->ob_type->tp_name);
604 Py_DECREF(unicode);
605 goto onError;
606 }
607 Py_DECREF(buffer);
608 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000609
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 onError:
611 Py_XDECREF(buffer);
612 return NULL;
613}
614
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000615PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
616 const char *encoding,
617 const char *errors)
618{
619 PyObject *v;
620
621 if (!PyUnicode_Check(unicode)) {
622 PyErr_BadArgument();
623 goto onError;
624 }
625
626 if (encoding == NULL)
627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Decode via the codec registry */
630 v = PyCodec_Decode(unicode, encoding, errors);
631 if (v == NULL)
632 goto onError;
633 return v;
634
635 onError:
636 return NULL;
637}
638
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000640 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 const char *encoding,
642 const char *errors)
643{
644 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 unicode = PyUnicode_FromUnicode(s, size);
647 if (unicode == NULL)
648 return NULL;
649 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
650 Py_DECREF(unicode);
651 return v;
652}
653
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000654PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
655 const char *encoding,
656 const char *errors)
657{
658 PyObject *v;
659
660 if (!PyUnicode_Check(unicode)) {
661 PyErr_BadArgument();
662 goto onError;
663 }
664
665 if (encoding == NULL)
666 encoding = PyUnicode_GetDefaultEncoding();
667
668 /* Encode via the codec registry */
669 v = PyCodec_Encode(unicode, encoding, errors);
670 if (v == NULL)
671 goto onError;
672 return v;
673
674 onError:
675 return NULL;
676}
677
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
679 const char *encoding,
680 const char *errors)
681{
682 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000683
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 if (!PyUnicode_Check(unicode)) {
685 PyErr_BadArgument();
686 goto onError;
687 }
Fred Drakee4315f52000-05-09 19:53:39 +0000688
Tim Petersced69f82003-09-16 20:30:58 +0000689 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000690 encoding = PyUnicode_GetDefaultEncoding();
691
692 /* Shortcuts for common default encodings */
693 if (errors == NULL) {
694 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000695 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000696 else if (strcmp(encoding, "latin-1") == 0)
697 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
699 else if (strcmp(encoding, "mbcs") == 0)
700 return PyUnicode_AsMBCSString(unicode);
701#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000702 else if (strcmp(encoding, "ascii") == 0)
703 return PyUnicode_AsASCIIString(unicode);
704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705
706 /* Encode via the codec registry */
707 v = PyCodec_Encode(unicode, encoding, errors);
708 if (v == NULL)
709 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000710 if (!PyBytes_Check(v)) {
711 if (PyString_Check(v)) {
712 /* Old codec, turn it into bytes */
713 PyObject *b = PyBytes_FromObject(v);
714 Py_DECREF(v);
715 return b;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000718 "encoder did not return a bytes object "
719 "(type=%.400s, encoding=%.20s, errors=%.20s)",
720 v->ob_type->tp_name,
721 encoding ? encoding : "NULL",
722 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000736 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000737 if (v)
738 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000739 if (errors != NULL)
740 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
741 if (errors == NULL) {
742 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
743 PyUnicode_GET_SIZE(unicode),
744 NULL);
745 }
746 else {
747 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
748 }
749 if (!b)
750 return NULL;
751 v = PyString_FromStringAndSize(PyBytes_AsString(b),
752 PyBytes_Size(b));
753 Py_DECREF(b);
754 if (!errors) {
755 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000756 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000757 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000758 return v;
759}
760
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
762{
763 if (!PyUnicode_Check(unicode)) {
764 PyErr_BadArgument();
765 goto onError;
766 }
767 return PyUnicode_AS_UNICODE(unicode);
768
769 onError:
770 return NULL;
771}
772
Martin v. Löwis18e16552006-02-15 17:27:45 +0000773Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774{
775 if (!PyUnicode_Check(unicode)) {
776 PyErr_BadArgument();
777 goto onError;
778 }
779 return PyUnicode_GET_SIZE(unicode);
780
781 onError:
782 return -1;
783}
784
Thomas Wouters78890102000-07-22 19:25:51 +0000785const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000786{
787 return unicode_default_encoding;
788}
789
790int PyUnicode_SetDefaultEncoding(const char *encoding)
791{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000792 if (strcmp(encoding, unicode_default_encoding) != 0) {
793 PyErr_Format(PyExc_ValueError,
794 "Can only set default encoding to %s",
795 unicode_default_encoding);
796 return -1;
797 }
Fred Drakee4315f52000-05-09 19:53:39 +0000798 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000799}
800
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801/* error handling callback helper:
802 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000803 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000804 and adjust various state variables.
805 return 0 on success, -1 on error
806*/
807
808static
809int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
810 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000811 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
812 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815
816 PyObject *restuple = NULL;
817 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
819 Py_ssize_t requiredsize;
820 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000821 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000822 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000823 int res = -1;
824
825 if (*errorHandler == NULL) {
826 *errorHandler = PyCodec_LookupError(errors);
827 if (*errorHandler == NULL)
828 goto onError;
829 }
830
831 if (*exceptionObject == NULL) {
832 *exceptionObject = PyUnicodeDecodeError_Create(
833 encoding, input, insize, *startinpos, *endinpos, reason);
834 if (*exceptionObject == NULL)
835 goto onError;
836 }
837 else {
838 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
839 goto onError;
840 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
841 goto onError;
842 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
843 goto onError;
844 }
845
846 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
847 if (restuple == NULL)
848 goto onError;
849 if (!PyTuple_Check(restuple)) {
850 PyErr_Format(PyExc_TypeError, &argparse[4]);
851 goto onError;
852 }
853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
854 goto onError;
855 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000856 newpos = insize+newpos;
857 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000858 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000859 goto onError;
860 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861
862 /* need more space? (at least enough for what we
863 have+the replacement+the rest of the string (starting
864 at the new input position), so we won't have to check space
865 when there are no errors in the rest of the string) */
866 repptr = PyUnicode_AS_UNICODE(repunicode);
867 repsize = PyUnicode_GET_SIZE(repunicode);
868 requiredsize = *outpos + repsize + insize-newpos;
869 if (requiredsize > outsize) {
870 if (requiredsize<2*outsize)
871 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000872 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000873 goto onError;
874 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
875 }
876 *endinpos = newpos;
877 *inptr = input + newpos;
878 Py_UNICODE_COPY(*outptr, repptr, repsize);
879 *outptr += repsize;
880 *outpos += repsize;
881 /* we made it! */
882 res = 0;
883
884 onError:
885 Py_XDECREF(restuple);
886 return res;
887}
888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889/* --- UTF-7 Codec -------------------------------------------------------- */
890
891/* see RFC2152 for details */
892
/* Per-character classification for the UTF-7 codec, indexed by ASCII
   code point; says whether a character is special, i.e. cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
911
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be base64-encoded: it is outside 1..127,
   always special, or belongs to an optional class (whitespace / Set O)
   whose encoding was requested through encodeWS / encodeO. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Spelled out as ASCII range
   checks rather than isalnum(): isalnum() depends on the current locale
   and is undefined for arguments that do not fit in an unsigned char,
   which the character values tested here are not guaranteed to do. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for every complete group of six bits held in
   the low `bits` bits of ch; bits is reduced accordingly.  out and bits
   must be modifiable lvalues. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit a 16-bit unit to out for every complete group of sixteen bits held
   in the low `bits` bits of ch.  Expands to code that assigns the
   enclosing function's `errmsg` variable and jumps to its `utf7Error`
   label, so it can only be used where both exist. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 const char *errors)
958{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000959 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t startinpos;
961 Py_ssize_t endinpos;
962 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000963 const char *e;
964 PyUnicodeObject *unicode;
965 Py_UNICODE *p;
966 const char *errmsg = "";
967 int inShift = 0;
968 unsigned int bitsleft = 0;
969 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000970 int surrogate = 0;
971 PyObject *errorHandler = NULL;
972 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000973
974 unicode = _PyUnicode_New(size);
975 if (!unicode)
976 return NULL;
977 if (size == 0)
978 return (PyObject *)unicode;
979
980 p = unicode->str;
981 e = s + size;
982
983 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 Py_UNICODE ch;
985 restart:
986 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987
988 if (inShift) {
989 if ((ch == '-') || !B64CHAR(ch)) {
990 inShift = 0;
991 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000992
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
994 if (bitsleft >= 6) {
995 /* The shift sequence has a partial character in it. If
996 bitsleft < 6 then we could just classify it as padding
997 but that is not the case here */
998
999 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001000 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 }
1002 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001003 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001004 here so indicate the potential of a misencoded character. */
1005
1006 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1007 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1008 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001009 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 }
1011
1012 if (ch == '-') {
1013 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001014 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 inShift = 1;
1016 }
1017 } else if (SPECIAL(ch,0,0)) {
1018 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001019 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 } else {
1021 *p++ = ch;
1022 }
1023 } else {
1024 charsleft = (charsleft << 6) | UB64(ch);
1025 bitsleft += 6;
1026 s++;
1027 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1028 }
1029 }
1030 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 s++;
1033 if (s < e && *s == '-') {
1034 s++;
1035 *p++ = '+';
1036 } else
1037 {
1038 inShift = 1;
1039 bitsleft = 0;
1040 }
1041 }
1042 else if (SPECIAL(ch,0,0)) {
1043 errmsg = "unexpected special character";
1044 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001045 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
1047 else {
1048 *p++ = ch;
1049 s++;
1050 }
1051 continue;
1052 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = s-starts;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", errmsg,
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
1060 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
1063 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 outpos = p-PyUnicode_AS_UNICODE(unicode);
1065 endinpos = size;
1066 if (unicode_decode_call_errorhandler(
1067 errors, &errorHandler,
1068 "utf7", "unterminated shift sequence",
1069 starts, size, &startinpos, &endinpos, &exc, &s,
1070 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001072 if (s < e)
1073 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 }
1075
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001076 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 goto onError;
1078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 Py_XDECREF(errorHandler);
1080 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 return (PyObject *)unicode;
1082
1083onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001084 Py_XDECREF(errorHandler);
1085 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 Py_DECREF(unicode);
1087 return NULL;
1088}
1089
1090
1091PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 int encodeSetO,
1094 int encodeWhiteSpace,
1095 const char *errors)
1096{
1097 PyObject *v;
1098 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001099 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001100 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001101 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102 unsigned int bitsleft = 0;
1103 unsigned long charsleft = 0;
1104 char * out;
1105 char * start;
1106
1107 if (size == 0)
1108 return PyString_FromStringAndSize(NULL, 0);
1109
1110 v = PyString_FromStringAndSize(NULL, cbAllocated);
1111 if (v == NULL)
1112 return NULL;
1113
1114 start = out = PyString_AS_STRING(v);
1115 for (;i < size; ++i) {
1116 Py_UNICODE ch = s[i];
1117
1118 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 if (ch == '+') {
1120 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 *out++ = '-';
1122 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1123 charsleft = ch;
1124 bitsleft = 16;
1125 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001126 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001127 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001128 } else {
1129 *out++ = (char) ch;
1130 }
1131 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001132 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1133 *out++ = B64(charsleft << (6-bitsleft));
1134 charsleft = 0;
1135 bitsleft = 0;
1136 /* Characters not in the BASE64 set implicitly unshift the sequence
1137 so no '-' is required, except if the character is itself a '-' */
1138 if (B64CHAR(ch) || ch == '-') {
1139 *out++ = '-';
1140 }
1141 inShift = 0;
1142 *out++ = (char) ch;
1143 } else {
1144 bitsleft += 16;
1145 charsleft = (charsleft << 16) | ch;
1146 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1147
1148 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001149 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 or '-' then the shift sequence will be terminated implicitly and we
1151 don't have to insert a '-'. */
1152
1153 if (bitsleft == 0) {
1154 if (i + 1 < size) {
1155 Py_UNICODE ch2 = s[i+1];
1156
1157 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 } else if (B64CHAR(ch2) || ch2 == '-') {
1160 *out++ = '-';
1161 inShift = 0;
1162 } else {
1163 inShift = 0;
1164 }
1165
1166 }
1167 else {
1168 *out++ = '-';
1169 inShift = 0;
1170 }
1171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 if (bitsleft) {
1176 *out++= B64(charsleft << (6-bitsleft) );
1177 *out++ = '-';
1178 }
1179
Tim Peters5de98422002-04-27 18:44:32 +00001180 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001181 return v;
1182}
1183
1184#undef SPECIAL
1185#undef B64
1186#undef B64CHAR
1187#undef UB64
1188#undef ENCODE
1189#undef DECODE
1190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191/* --- UTF-8 Codec -------------------------------------------------------- */
1192
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 const char *errors)
1218{
Walter Dörwald69652032004-09-07 20:24:22 +00001219 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1220}
1221
1222PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001223 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001224 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001225 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t startinpos;
1230 Py_ssize_t endinpos;
1231 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *e;
1233 PyUnicodeObject *unicode;
1234 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 PyObject *errorHandler = NULL;
1237 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Note: size will always be longer than the resulting Unicode
1240 character count */
1241 unicode = _PyUnicode_New(size);
1242 if (!unicode)
1243 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001244 if (size == 0) {
1245 if (consumed)
1246 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Unpack UTF-8 encoded data */
1251 p = unicode->str;
1252 e = s + size;
1253
1254 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001255 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256
1257 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 s++;
1260 continue;
1261 }
1262
1263 n = utf8_code_length[ch];
1264
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001266 if (consumed)
1267 break;
1268 else {
1269 errmsg = "unexpected end of data";
1270 startinpos = s-starts;
1271 endinpos = size;
1272 goto utf8Error;
1273 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 switch (n) {
1277
1278 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001291 if ((s[1] & 0xc0) != 0x80) {
1292 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 startinpos = s-starts;
1294 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 goto utf8Error;
1296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001298 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 errmsg = "illegal encoding";
1302 goto utf8Error;
1303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001309 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 (s[2] & 0xc0) != 0x80) {
1311 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001312 startinpos = s-starts;
1313 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 goto utf8Error;
1315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001317 if (ch < 0x0800) {
1318 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001319 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320
1321 XXX For wide builds (UCS-4) we should probably try
1322 to recombine the surrogates into a single code
1323 unit.
1324 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001331 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 break;
1333
1334 case 4:
1335 if ((s[1] & 0xc0) != 0x80 ||
1336 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001337 (s[3] & 0xc0) != 0x80) {
1338 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001343 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1344 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1345 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001347 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001349 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001350 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001351 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 startinpos = s-starts;
1353 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001354 goto utf8Error;
1355 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001356#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001357 *p++ = (Py_UNICODE)ch;
1358#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001360
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001361 /* translate from 10000..10FFFF to 0..FFFF */
1362 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001363
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001364 /* high surrogate = top 10 bits added to D800 */
1365 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001367 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001368 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 break;
1371
1372 default:
1373 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 startinpos = s-starts;
1376 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001377 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 }
1379 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 outpos = p-PyUnicode_AS_UNICODE(unicode);
1384 if (unicode_decode_call_errorhandler(
1385 errors, &errorHandler,
1386 "utf8", errmsg,
1387 starts, size, &startinpos, &endinpos, &exc, &s,
1388 (PyObject **)&unicode, &outpos, &p))
1389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Walter Dörwald69652032004-09-07 20:24:22 +00001391 if (consumed)
1392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393
1394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 goto onError;
1397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 Py_XDECREF(errorHandler);
1399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 return (PyObject *)unicode;
1401
1402onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 Py_XDECREF(errorHandler);
1404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 Py_DECREF(unicode);
1406 return NULL;
1407}
1408
Tim Peters602f7402002-04-27 18:03:26 +00001409/* Allocation strategy: if the string is short, convert into a stack buffer
1410 and allocate exactly as much space needed at the end. Else allocate the
1411 maximum possible needed (4 result bytes per Unicode character), and return
1412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001414PyObject *
1415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418{
Tim Peters602f7402002-04-27 18:03:26 +00001419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001420
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001422 PyObject *v; /* result string object */
1423 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001424 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 assert(s != NULL);
1429 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430
Tim Peters602f7402002-04-27 18:03:26 +00001431 if (size <= MAX_SHORT_UNICHARS) {
1432 /* Write into the stack buffer; nallocated can't overflow.
1433 * At the end, we'll allocate exactly as much heap space as it
1434 * turns out we need.
1435 */
1436 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1437 v = NULL; /* will allocate after we're done */
1438 p = stackbuf;
1439 }
1440 else {
1441 /* Overallocate on the heap, and give the excess back at the end. */
1442 nallocated = size * 4;
1443 if (nallocated / 4 != size) /* overflow! */
1444 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001445 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001446 if (v == NULL)
1447 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001448 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001449 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001450
Tim Peters602f7402002-04-27 18:03:26 +00001451 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001452 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001453
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001454 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001457
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001459 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001460 *p++ = (char)(0xc0 | (ch >> 6));
1461 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001463 else {
Tim Peters602f7402002-04-27 18:03:26 +00001464 /* Encode UCS2 Unicode ordinals */
1465 if (ch < 0x10000) {
1466 /* Special case: check for high surrogate */
1467 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1468 Py_UCS4 ch2 = s[i];
1469 /* Check for low surrogate and combine the two to
1470 form a UCS4 value */
1471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001472 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001473 i++;
1474 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001475 }
Tim Peters602f7402002-04-27 18:03:26 +00001476 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001478 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 continue;
1482 }
1483encodeUCS4:
1484 /* Encode UCS4 Unicode ordinals */
1485 *p++ = (char)(0xf0 | (ch >> 18));
1486 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1487 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1488 *p++ = (char)(0x80 | (ch & 0x3f));
1489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001491
Tim Peters602f7402002-04-27 18:03:26 +00001492 if (v == NULL) {
1493 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001494 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001495 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001497 }
1498 else {
1499 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001500 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001501 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001502 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505
Tim Peters602f7402002-04-27 18:03:26 +00001506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 if (!PyUnicode_Check(unicode)) {
1512 PyErr_BadArgument();
1513 return NULL;
1514 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518}
1519
1520/* --- UTF-16 Codec ------------------------------------------------------- */
1521
Tim Peters772747b2001-08-09 22:21:55 +00001522PyObject *
1523PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001525 const char *errors,
1526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527{
Walter Dörwald69652032004-09-07 20:24:22 +00001528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1529}
1530
1531PyObject *
1532PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001533 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001534 const char *errors,
1535 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t startinpos;
1540 Py_ssize_t endinpos;
1541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 PyUnicodeObject *unicode;
1543 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001544 const unsigned char *q, *e;
1545 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001546 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001547 /* Offsets from q for retrieving byte pairs in the right order. */
1548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1549 int ihi = 1, ilo = 0;
1550#else
1551 int ihi = 0, ilo = 1;
1552#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 PyObject *errorHandler = NULL;
1554 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 /* Note: size will always be longer than the resulting Unicode
1557 character count */
1558 unicode = _PyUnicode_New(size);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-16 encoded data */
1565 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001566 q = (unsigned char *)s;
1567 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568
1569 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001570 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001577 if (size >= 2) {
1578 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = -1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = 1;
1587 }
Tim Petersced69f82003-09-16 20:30:58 +00001588#else
Walter Dörwald69652032004-09-07 20:24:22 +00001589 if (bom == 0xFEFF) {
1590 q += 2;
1591 bo = 1;
1592 }
1593 else if (bom == 0xFFFE) {
1594 q += 2;
1595 bo = -1;
1596 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001597#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001598 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Tim Peters772747b2001-08-09 22:21:55 +00001601 if (bo == -1) {
1602 /* force LE */
1603 ihi = 1;
1604 ilo = 0;
1605 }
1606 else if (bo == 1) {
1607 /* force BE */
1608 ihi = 0;
1609 ilo = 1;
1610 }
1611
1612 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001614 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001616 if (consumed)
1617 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 errmsg = "truncated data";
1619 startinpos = ((const char *)q)-starts;
1620 endinpos = ((const char *)e)-starts;
1621 goto utf16Error;
1622 /* The remaining input chars are ignored if the callback
1623 chooses to skip the input */
1624 }
1625 ch = (q[ihi] << 8) | q[ilo];
1626
Tim Peters772747b2001-08-09 22:21:55 +00001627 q += 2;
1628
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 if (ch < 0xD800 || ch > 0xDFFF) {
1630 *p++ = ch;
1631 continue;
1632 }
1633
1634 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 if (q >= e) {
1636 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 startinpos = (((const char *)q)-2)-starts;
1638 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001639 goto utf16Error;
1640 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001641 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1643 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001644 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001645#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001646 *p++ = ch;
1647 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648#else
1649 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001650#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001651 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652 }
1653 else {
1654 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 startinpos = (((const char *)q)-4)-starts;
1656 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001657 goto utf16Error;
1658 }
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001661 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 startinpos = (((const char *)q)-2)-starts;
1663 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 /* Fall through to report the error */
1665
1666 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 outpos = p-PyUnicode_AS_UNICODE(unicode);
1668 if (unicode_decode_call_errorhandler(
1669 errors, &errorHandler,
1670 "utf16", errmsg,
1671 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1672 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 }
1675
1676 if (byteorder)
1677 *byteorder = bo;
1678
Walter Dörwald69652032004-09-07 20:24:22 +00001679 if (consumed)
1680 *consumed = (const char *)q-starts;
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001683 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 goto onError;
1685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 Py_XDECREF(errorHandler);
1687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 return (PyObject *)unicode;
1689
1690onError:
1691 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 Py_XDECREF(errorHandler);
1693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return NULL;
1695}
1696
Tim Peters772747b2001-08-09 22:21:55 +00001697PyObject *
1698PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001700 const char *errors,
1701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702{
1703 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001704 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001705#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001706 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001707#else
1708 const int pairs = 0;
1709#endif
Tim Peters772747b2001-08-09 22:21:55 +00001710 /* Offsets from p for storing byte pairs in the right order. */
1711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1712 int ihi = 1, ilo = 0;
1713#else
1714 int ihi = 0, ilo = 1;
1715#endif
1716
1717#define STORECHAR(CH) \
1718 do { \
1719 p[ihi] = ((CH) >> 8) & 0xff; \
1720 p[ilo] = (CH) & 0xff; \
1721 p += 2; \
1722 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001725 for (i = pairs = 0; i < size; i++)
1726 if (s[i] >= 0x10000)
1727 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001728#endif
Tim Petersced69f82003-09-16 20:30:58 +00001729 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001730 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 if (v == NULL)
1732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
Tim Peters772747b2001-08-09 22:21:55 +00001734 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001736 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001737 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001738 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001739
1740 if (byteorder == -1) {
1741 /* force LE */
1742 ihi = 1;
1743 ilo = 0;
1744 }
1745 else if (byteorder == 1) {
1746 /* force BE */
1747 ihi = 0;
1748 ilo = 1;
1749 }
1750
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751 while (size-- > 0) {
1752 Py_UNICODE ch = *s++;
1753 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001754#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1757 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001759#endif
Tim Peters772747b2001-08-09 22:21:55 +00001760 STORECHAR(ch);
1761 if (ch2)
1762 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001765#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766}
1767
1768PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1769{
1770 if (!PyUnicode_Check(unicode)) {
1771 PyErr_BadArgument();
1772 return NULL;
1773 }
1774 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1775 PyUnicode_GET_SIZE(unicode),
1776 NULL,
1777 0);
1778}
1779
1780/* --- Unicode Escape Codec ----------------------------------------------- */
1781
Fredrik Lundh06d12682001-01-24 07:59:11 +00001782static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001783
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001785 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 const char *errors)
1787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001789 Py_ssize_t startinpos;
1790 Py_ssize_t endinpos;
1791 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001796 char* message;
1797 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001798 PyObject *errorHandler = NULL;
1799 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 /* Escaped strings will always be longer than the resulting
1802 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 length after conversion to the true value.
1804 (but if the error callback returns a long replacement string
1805 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 v = _PyUnicode_New(size);
1807 if (v == NULL)
1808 goto onError;
1809 if (size == 0)
1810 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (s < end) {
1816 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001817 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 /* Non-escape characters are interpreted as Unicode ordinals */
1821 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 continue;
1824 }
1825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 /* \ - Escapes */
1828 s++;
1829 switch (*s++) {
1830
1831 /* \x escapes */
1832 case '\n': break;
1833 case '\\': *p++ = '\\'; break;
1834 case '\'': *p++ = '\''; break;
1835 case '\"': *p++ = '\"'; break;
1836 case 'b': *p++ = '\b'; break;
1837 case 'f': *p++ = '\014'; break; /* FF */
1838 case 't': *p++ = '\t'; break;
1839 case 'n': *p++ = '\n'; break;
1840 case 'r': *p++ = '\r'; break;
1841 case 'v': *p++ = '\013'; break; /* VT */
1842 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1843
1844 /* \OOO (octal) escapes */
1845 case '0': case '1': case '2': case '3':
1846 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001847 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001849 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001851 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001853 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 break;
1855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* hex escapes */
1857 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 digits = 2;
1860 message = "truncated \\xXX escape";
1861 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865 digits = 4;
1866 message = "truncated \\uXXXX escape";
1867 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
Fredrik Lundhccc74732001-02-18 22:13:49 +00001869 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001870 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 digits = 8;
1872 message = "truncated \\UXXXXXXXX escape";
1873 hexescape:
1874 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 outpos = p-PyUnicode_AS_UNICODE(v);
1876 if (s+digits>end) {
1877 endinpos = size;
1878 if (unicode_decode_call_errorhandler(
1879 errors, &errorHandler,
1880 "unicodeescape", "end of string in escape sequence",
1881 starts, size, &startinpos, &endinpos, &exc, &s,
1882 (PyObject **)&v, &outpos, &p))
1883 goto onError;
1884 goto nextByte;
1885 }
1886 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001887 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001888 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001889 endinpos = (s+i+1)-starts;
1890 if (unicode_decode_call_errorhandler(
1891 errors, &errorHandler,
1892 "unicodeescape", message,
1893 starts, size, &startinpos, &endinpos, &exc, &s,
1894 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001896 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001897 }
1898 chr = (chr<<4) & ~0xF;
1899 if (c >= '0' && c <= '9')
1900 chr += c - '0';
1901 else if (c >= 'a' && c <= 'f')
1902 chr += 10 + c - 'a';
1903 else
1904 chr += 10 + c - 'A';
1905 }
1906 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001907 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 /* _decoding_error will have already written into the
1909 target buffer. */
1910 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001912 /* when we get here, chr is a 32-bit unicode character */
1913 if (chr <= 0xffff)
1914 /* UCS-2 character */
1915 *p++ = (Py_UNICODE) chr;
1916 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001917 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001918 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001919#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001920 *p++ = chr;
1921#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001922 chr -= 0x10000L;
1923 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001924 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 endinpos = s-starts;
1928 outpos = p-PyUnicode_AS_UNICODE(v);
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "illegal Unicode character",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001934 goto onError;
1935 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 break;
1937
1938 /* \N{name} */
1939 case 'N':
1940 message = "malformed \\N character escape";
1941 if (ucnhash_CAPI == NULL) {
1942 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001943 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001944 m = PyImport_ImportModule("unicodedata");
1945 if (m == NULL)
1946 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001948 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001949 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001951 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001952 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 if (ucnhash_CAPI == NULL)
1954 goto ucnhashError;
1955 }
1956 if (*s == '{') {
1957 const char *start = s+1;
1958 /* look for the closing brace */
1959 while (*s != '}' && s < end)
1960 s++;
1961 if (s > start && s < end && *s == '}') {
1962 /* found a name. look it up in the unicode database */
1963 message = "unknown Unicode character name";
1964 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001965 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001966 goto store;
1967 }
1968 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001969 endinpos = s-starts;
1970 outpos = p-PyUnicode_AS_UNICODE(v);
1971 if (unicode_decode_call_errorhandler(
1972 errors, &errorHandler,
1973 "unicodeescape", message,
1974 starts, size, &startinpos, &endinpos, &exc, &s,
1975 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001976 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001977 break;
1978
1979 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001980 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 message = "\\ at end of string";
1982 s--;
1983 endinpos = s-starts;
1984 outpos = p-PyUnicode_AS_UNICODE(v);
1985 if (unicode_decode_call_errorhandler(
1986 errors, &errorHandler,
1987 "unicodeescape", message,
1988 starts, size, &startinpos, &endinpos, &exc, &s,
1989 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001990 goto onError;
1991 }
1992 else {
1993 *p++ = '\\';
1994 *p++ = (unsigned char)s[-1];
1995 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001996 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001998 nextByte:
1999 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002001 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002003 Py_XDECREF(errorHandler);
2004 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002006
Fredrik Lundhccc74732001-02-18 22:13:49 +00002007ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002008 PyErr_SetString(
2009 PyExc_UnicodeError,
2010 "\\N escapes not supported (can't load unicodedata module)"
2011 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002015 return NULL;
2016
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 Py_XDECREF(errorHandler);
2020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 return NULL;
2022}
2023
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of a Py_UNICODE buffer.

   If quotes is true, the string is enclosed in '' or "" quotes as
   appropriate.

*/
2030
Thomas Wouters477c8d52006-05-27 19:21:47 +00002031Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2032 Py_ssize_t size,
2033 Py_UNICODE ch)
2034{
2035 /* like wcschr, but doesn't stop at NULL characters */
2036
2037 while (size-- > 0) {
2038 if (*s == ch)
2039 return s;
2040 s++;
2041 }
2042
2043 return NULL;
2044}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002045
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046static
2047PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 int quotes)
2050{
2051 PyObject *repr;
2052 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002054 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055
Thomas Wouters89f507f2006-12-13 04:49:30 +00002056 /* XXX(nnorwitz): rather than over-allocating, it would be
2057 better to choose a different scheme. Perhaps scan the
2058 first N-chars of the string and allocate based on that size.
2059 */
2060 /* Initial allocation is based on the longest-possible unichr
2061 escape.
2062
2063 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2064 unichr, so in this case it's the longest unichr escape. In
2065 narrow (UTF-16) builds this is five chars per source unichr
2066 since there are two unichrs in the surrogate pair, so in narrow
2067 (UTF-16) builds it's not the longest unichr escape.
2068
2069 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2070 so in the narrow (UTF-16) build case it's the longest unichr
2071 escape.
2072 */
2073
2074 repr = PyString_FromStringAndSize(NULL,
2075 2
2076#ifdef Py_UNICODE_WIDE
2077 + 10*size
2078#else
2079 + 6*size
2080#endif
2081 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 if (repr == NULL)
2083 return NULL;
2084
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086
2087 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002088 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 !findchar(s, size, '"')) ? '"' : '\'';
2090 }
2091 while (size-- > 0) {
2092 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002093
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002094 /* Escape quotes and backslashes */
2095 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002096 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 *p++ = '\\';
2098 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002099 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002101
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002102#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002103 /* Map 21-bit characters to '\U00xxxxxx' */
2104 else if (ch >= 0x10000) {
2105 *p++ = '\\';
2106 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2108 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2109 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2110 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2111 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2112 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2113 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002114 *p++ = hexdigit[ch & 0x0000000F];
2115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002117#else
2118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002119 else if (ch >= 0xD800 && ch < 0xDC00) {
2120 Py_UNICODE ch2;
2121 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002123 ch2 = *s++;
2124 size--;
2125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2127 *p++ = '\\';
2128 *p++ = 'U';
2129 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2130 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2131 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2132 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2133 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2134 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2135 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2136 *p++ = hexdigit[ucs & 0x0000000F];
2137 continue;
2138 }
2139 /* Fall through: isolated surrogates are copied as-is */
2140 s--;
2141 size++;
2142 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002143#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002146 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 *p++ = '\\';
2148 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002149 *p++ = hexdigit[(ch >> 12) & 0x000F];
2150 *p++ = hexdigit[(ch >> 8) & 0x000F];
2151 *p++ = hexdigit[(ch >> 4) & 0x000F];
2152 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002155 /* Map special whitespace to '\t', \n', '\r' */
2156 else if (ch == '\t') {
2157 *p++ = '\\';
2158 *p++ = 't';
2159 }
2160 else if (ch == '\n') {
2161 *p++ = '\\';
2162 *p++ = 'n';
2163 }
2164 else if (ch == '\r') {
2165 *p++ = '\\';
2166 *p++ = 'r';
2167 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002168
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002169 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002170 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002172 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002173 *p++ = hexdigit[(ch >> 4) & 0x000F];
2174 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002176
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002182 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183
2184 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002185 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return repr;
2187}
2188
2189PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191{
2192 return unicodeescape_string(s, size, 0);
2193}
2194
2195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2196{
2197 if (!PyUnicode_Check(unicode)) {
2198 PyErr_BadArgument();
2199 return NULL;
2200 }
2201 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2202 PyUnicode_GET_SIZE(unicode));
2203}
2204
2205/* --- Raw Unicode Escape Codec ------------------------------------------- */
2206
2207PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 const char *errors)
2210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002212 Py_ssize_t startinpos;
2213 Py_ssize_t endinpos;
2214 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 const char *end;
2218 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 PyObject *errorHandler = NULL;
2220 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 /* Escaped strings will always be longer than the resulting
2223 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 length after conversion to the true value. (But decoding error
2225 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 v = _PyUnicode_New(size);
2227 if (v == NULL)
2228 goto onError;
2229 if (size == 0)
2230 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 end = s + size;
2233 while (s < end) {
2234 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002235 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002237 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 /* Non-escape characters are interpreted as Unicode ordinals */
2240 if (*s != '\\') {
2241 *p++ = (unsigned char)*s++;
2242 continue;
2243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
2246 /* \u-escapes are only interpreted iff the number of leading
2247 backslashes if odd */
2248 bs = s;
2249 for (;s < end;) {
2250 if (*s != '\\')
2251 break;
2252 *p++ = (unsigned char)*s++;
2253 }
2254 if (((s - bs) & 1) == 0 ||
2255 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 continue;
2258 }
2259 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002260 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 s++;
2262
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002263 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002265 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 endinpos = s-starts;
2269 if (unicode_decode_call_errorhandler(
2270 errors, &errorHandler,
2271 "rawunicodeescape", "truncated \\uXXXX",
2272 starts, size, &startinpos, &endinpos, &exc, &s,
2273 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 x = (x<<4) & ~0xF;
2278 if (c >= '0' && c <= '9')
2279 x += c - '0';
2280 else if (c >= 'a' && c <= 'f')
2281 x += 10 + c - 'a';
2282 else
2283 x += 10 + c - 'A';
2284 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002285#ifndef Py_UNICODE_WIDE
2286 if (x > 0x10000) {
2287 if (unicode_decode_call_errorhandler(
2288 errors, &errorHandler,
2289 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2290 starts, size, &startinpos, &endinpos, &exc, &s,
2291 (PyObject **)&v, &outpos, &p))
2292 goto onError;
2293 }
2294#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 *p++ = x;
2296 nextByte:
2297 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002299 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002300 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 Py_XDECREF(errorHandler);
2302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002304
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 onError:
2306 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 Py_XDECREF(errorHandler);
2308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 return NULL;
2310}
2311
2312PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002313 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314{
2315 PyObject *repr;
2316 char *p;
2317 char *q;
2318
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002319 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002321#ifdef Py_UNICODE_WIDE
2322 repr = PyString_FromStringAndSize(NULL, 10 * size);
2323#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 if (repr == NULL)
2327 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002328 if (size == 0)
2329 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
2331 p = q = PyString_AS_STRING(repr);
2332 while (size-- > 0) {
2333 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002334#ifdef Py_UNICODE_WIDE
2335 /* Map 32-bit characters to '\Uxxxxxxxx' */
2336 if (ch >= 0x10000) {
2337 *p++ = '\\';
2338 *p++ = 'U';
2339 *p++ = hexdigit[(ch >> 28) & 0xf];
2340 *p++ = hexdigit[(ch >> 24) & 0xf];
2341 *p++ = hexdigit[(ch >> 20) & 0xf];
2342 *p++ = hexdigit[(ch >> 16) & 0xf];
2343 *p++ = hexdigit[(ch >> 12) & 0xf];
2344 *p++ = hexdigit[(ch >> 8) & 0xf];
2345 *p++ = hexdigit[(ch >> 4) & 0xf];
2346 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002347 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002348 else
2349#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 /* Map 16-bit characters to '\uxxxx' */
2351 if (ch >= 256) {
2352 *p++ = '\\';
2353 *p++ = 'u';
2354 *p++ = hexdigit[(ch >> 12) & 0xf];
2355 *p++ = hexdigit[(ch >> 8) & 0xf];
2356 *p++ = hexdigit[(ch >> 4) & 0xf];
2357 *p++ = hexdigit[ch & 15];
2358 }
2359 /* Copy everything else as-is */
2360 else
2361 *p++ = (char) ch;
2362 }
2363 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002364 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 return repr;
2366}
2367
2368PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2369{
2370 if (!PyUnicode_Check(unicode)) {
2371 PyErr_BadArgument();
2372 return NULL;
2373 }
2374 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode));
2376}
2377
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002378/* --- Unicode Internal Codec ------------------------------------------- */
2379
2380PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002381 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002382 const char *errors)
2383{
2384 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002385 Py_ssize_t startinpos;
2386 Py_ssize_t endinpos;
2387 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002388 PyUnicodeObject *v;
2389 Py_UNICODE *p;
2390 const char *end;
2391 const char *reason;
2392 PyObject *errorHandler = NULL;
2393 PyObject *exc = NULL;
2394
Neal Norwitzd43069c2006-01-08 01:12:10 +00002395#ifdef Py_UNICODE_WIDE
2396 Py_UNICODE unimax = PyUnicode_GetMax();
2397#endif
2398
Thomas Wouters89f507f2006-12-13 04:49:30 +00002399 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002400 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2401 if (v == NULL)
2402 goto onError;
2403 if (PyUnicode_GetSize((PyObject *)v) == 0)
2404 return (PyObject *)v;
2405 p = PyUnicode_AS_UNICODE(v);
2406 end = s + size;
2407
2408 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002409 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002410 /* We have to sanity check the raw data, otherwise doom looms for
2411 some malformed UCS-4 data. */
2412 if (
2413 #ifdef Py_UNICODE_WIDE
2414 *p > unimax || *p < 0 ||
2415 #endif
2416 end-s < Py_UNICODE_SIZE
2417 )
2418 {
2419 startinpos = s - starts;
2420 if (end-s < Py_UNICODE_SIZE) {
2421 endinpos = end-starts;
2422 reason = "truncated input";
2423 }
2424 else {
2425 endinpos = s - starts + Py_UNICODE_SIZE;
2426 reason = "illegal code point (> 0x10FFFF)";
2427 }
2428 outpos = p - PyUnicode_AS_UNICODE(v);
2429 if (unicode_decode_call_errorhandler(
2430 errors, &errorHandler,
2431 "unicode_internal", reason,
2432 starts, size, &startinpos, &endinpos, &exc, &s,
2433 (PyObject **)&v, &outpos, &p)) {
2434 goto onError;
2435 }
2436 }
2437 else {
2438 p++;
2439 s += Py_UNICODE_SIZE;
2440 }
2441 }
2442
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002443 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002444 goto onError;
2445 Py_XDECREF(errorHandler);
2446 Py_XDECREF(exc);
2447 return (PyObject *)v;
2448
2449 onError:
2450 Py_XDECREF(v);
2451 Py_XDECREF(errorHandler);
2452 Py_XDECREF(exc);
2453 return NULL;
2454}
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456/* --- Latin-1 Codec ------------------------------------------------------ */
2457
2458PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 const char *errors)
2461{
2462 PyUnicodeObject *v;
2463 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002466 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002467 Py_UNICODE r = *(unsigned char*)s;
2468 return PyUnicode_FromUnicode(&r, 1);
2469 }
2470
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 v = _PyUnicode_New(size);
2472 if (v == NULL)
2473 goto onError;
2474 if (size == 0)
2475 return (PyObject *)v;
2476 p = PyUnicode_AS_UNICODE(v);
2477 while (size-- > 0)
2478 *p++ = (unsigned char)*s++;
2479 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 onError:
2482 Py_XDECREF(v);
2483 return NULL;
2484}
2485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486/* create or adjust a UnicodeEncodeError */
2487static void make_encode_exception(PyObject **exceptionObject,
2488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002489 const Py_UNICODE *unicode, Py_ssize_t size,
2490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 if (*exceptionObject == NULL) {
2494 *exceptionObject = PyUnicodeEncodeError_Create(
2495 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2499 goto onError;
2500 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2501 goto onError;
2502 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2503 goto onError;
2504 return;
2505 onError:
2506 Py_DECREF(*exceptionObject);
2507 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 }
2509}
2510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511/* raises a UnicodeEncodeError */
2512static void raise_encode_exception(PyObject **exceptionObject,
2513 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 const Py_UNICODE *unicode, Py_ssize_t size,
2515 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 const char *reason)
2517{
2518 make_encode_exception(exceptionObject,
2519 encoding, unicode, size, startpos, endpos, reason);
2520 if (*exceptionObject != NULL)
2521 PyCodec_StrictErrors(*exceptionObject);
2522}
2523
2524/* error handling callback helper:
2525 build arguments, call the callback and check the arguments,
2526 put the result into newpos and return the replacement string, which
2527 has to be freed by the caller */
2528static PyObject *unicode_encode_call_errorhandler(const char *errors,
2529 PyObject **errorHandler,
2530 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2532 Py_ssize_t startpos, Py_ssize_t endpos,
2533 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536
2537 PyObject *restuple;
2538 PyObject *resunicode;
2539
2540 if (*errorHandler == NULL) {
2541 *errorHandler = PyCodec_LookupError(errors);
2542 if (*errorHandler == NULL)
2543 return NULL;
2544 }
2545
2546 make_encode_exception(exceptionObject,
2547 encoding, unicode, size, startpos, endpos, reason);
2548 if (*exceptionObject == NULL)
2549 return NULL;
2550
2551 restuple = PyObject_CallFunctionObjArgs(
2552 *errorHandler, *exceptionObject, NULL);
2553 if (restuple == NULL)
2554 return NULL;
2555 if (!PyTuple_Check(restuple)) {
2556 PyErr_Format(PyExc_TypeError, &argparse[4]);
2557 Py_DECREF(restuple);
2558 return NULL;
2559 }
2560 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2561 &resunicode, newpos)) {
2562 Py_DECREF(restuple);
2563 return NULL;
2564 }
2565 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002566 *newpos = size+*newpos;
2567 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002568 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002569 Py_DECREF(restuple);
2570 return NULL;
2571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 Py_INCREF(resunicode);
2573 Py_DECREF(restuple);
2574 return resunicode;
2575}
2576
2577static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 const char *errors,
2580 int limit)
2581{
2582 /* output object */
2583 PyObject *res;
2584 /* pointers to the beginning and end+1 of input */
2585 const Py_UNICODE *startp = p;
2586 const Py_UNICODE *endp = p + size;
2587 /* pointer to the beginning of the unencodable characters */
2588 /* const Py_UNICODE *badp = NULL; */
2589 /* pointer into the output */
2590 char *str;
2591 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t respos = 0;
2593 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002594 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2595 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 PyObject *errorHandler = NULL;
2597 PyObject *exc = NULL;
2598 /* the following variable is used for caching string comparisons
2599 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2600 int known_errorHandler = -1;
2601
2602 /* allocate enough for a simple encoding without
2603 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002604 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 if (res == NULL)
2606 goto onError;
2607 if (size == 0)
2608 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002609 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 ressize = size;
2611
2612 while (p<endp) {
2613 Py_UNICODE c = *p;
2614
2615 /* can we encode this? */
2616 if (c<limit) {
2617 /* no overflow check, because we know that the space is enough */
2618 *str++ = (char)c;
2619 ++p;
2620 }
2621 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t unicodepos = p-startp;
2623 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002625 Py_ssize_t repsize;
2626 Py_ssize_t newpos;
2627 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_UNICODE *uni2;
2629 /* startpos for collecting unencodable chars */
2630 const Py_UNICODE *collstart = p;
2631 const Py_UNICODE *collend = p;
2632 /* find all unecodable characters */
2633 while ((collend < endp) && ((*collend)>=limit))
2634 ++collend;
2635 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2636 if (known_errorHandler==-1) {
2637 if ((errors==NULL) || (!strcmp(errors, "strict")))
2638 known_errorHandler = 1;
2639 else if (!strcmp(errors, "replace"))
2640 known_errorHandler = 2;
2641 else if (!strcmp(errors, "ignore"))
2642 known_errorHandler = 3;
2643 else if (!strcmp(errors, "xmlcharrefreplace"))
2644 known_errorHandler = 4;
2645 else
2646 known_errorHandler = 0;
2647 }
2648 switch (known_errorHandler) {
2649 case 1: /* strict */
2650 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2651 goto onError;
2652 case 2: /* replace */
2653 while (collstart++<collend)
2654 *str++ = '?'; /* fall through */
2655 case 3: /* ignore */
2656 p = collend;
2657 break;
2658 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002659 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 /* determine replacement size (temporarily (mis)uses p) */
2661 for (p = collstart, repsize = 0; p < collend; ++p) {
2662 if (*p<10)
2663 repsize += 2+1+1;
2664 else if (*p<100)
2665 repsize += 2+2+1;
2666 else if (*p<1000)
2667 repsize += 2+3+1;
2668 else if (*p<10000)
2669 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002670#ifndef Py_UNICODE_WIDE
2671 else
2672 repsize += 2+5+1;
2673#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 else if (*p<100000)
2675 repsize += 2+5+1;
2676 else if (*p<1000000)
2677 repsize += 2+6+1;
2678 else
2679 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002680#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 }
2682 requiredsize = respos+repsize+(endp-collend);
2683 if (requiredsize > ressize) {
2684 if (requiredsize<2*ressize)
2685 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002686 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002688 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 ressize = requiredsize;
2690 }
2691 /* generate replacement (temporarily (mis)uses p) */
2692 for (p = collstart; p < collend; ++p) {
2693 str += sprintf(str, "&#%d;", (int)*p);
2694 }
2695 p = collend;
2696 break;
2697 default:
2698 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2699 encoding, reason, startp, size, &exc,
2700 collstart-startp, collend-startp, &newpos);
2701 if (repunicode == NULL)
2702 goto onError;
2703 /* need more space? (at least enough for what we
2704 have+the replacement+the rest of the string, so
2705 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002706 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 repsize = PyUnicode_GET_SIZE(repunicode);
2708 requiredsize = respos+repsize+(endp-collend);
2709 if (requiredsize > ressize) {
2710 if (requiredsize<2*ressize)
2711 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002712 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_DECREF(repunicode);
2714 goto onError;
2715 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002716 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 ressize = requiredsize;
2718 }
2719 /* check if there is anything unencodable in the replacement
2720 and copy it to the output */
2721 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2722 c = *uni2;
2723 if (c >= limit) {
2724 raise_encode_exception(&exc, encoding, startp, size,
2725 unicodepos, unicodepos+1, reason);
2726 Py_DECREF(repunicode);
2727 goto onError;
2728 }
2729 *str = (char)c;
2730 }
2731 p = startp + newpos;
2732 Py_DECREF(repunicode);
2733 }
2734 }
2735 }
2736 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002737 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 if (respos<ressize)
2739 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002740 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 Py_XDECREF(errorHandler);
2742 Py_XDECREF(exc);
2743 return res;
2744
2745 onError:
2746 Py_XDECREF(res);
2747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
2749 return NULL;
2750}
2751
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002753 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 const char *errors)
2755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
2759PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2760{
2761 if (!PyUnicode_Check(unicode)) {
2762 PyErr_BadArgument();
2763 return NULL;
2764 }
2765 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2766 PyUnicode_GET_SIZE(unicode),
2767 NULL);
2768}
2769
2770/* --- 7-bit ASCII Codec -------------------------------------------------- */
2771
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002773 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 const char *errors)
2775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 PyUnicodeObject *v;
2778 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002779 Py_ssize_t startinpos;
2780 Py_ssize_t endinpos;
2781 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 const char *e;
2783 PyObject *errorHandler = NULL;
2784 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002787 if (size == 1 && *(unsigned char*)s < 128) {
2788 Py_UNICODE r = *(unsigned char*)s;
2789 return PyUnicode_FromUnicode(&r, 1);
2790 }
Tim Petersced69f82003-09-16 20:30:58 +00002791
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 v = _PyUnicode_New(size);
2793 if (v == NULL)
2794 goto onError;
2795 if (size == 0)
2796 return (PyObject *)v;
2797 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 e = s + size;
2799 while (s < e) {
2800 register unsigned char c = (unsigned char)*s;
2801 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 ++s;
2804 }
2805 else {
2806 startinpos = s-starts;
2807 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002808 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "ascii", "ordinal not in range(128)",
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002817 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002818 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002819 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 onError:
2825 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 Py_XDECREF(errorHandler);
2827 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 return NULL;
2829}
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 const char *errors)
2834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
2838PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2839{
2840 if (!PyUnicode_Check(unicode)) {
2841 PyErr_BadArgument();
2842 return NULL;
2843 }
2844 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2845 PyUnicode_GET_SIZE(unicode),
2846 NULL);
2847}
2848
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002849#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002851/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002853#if SIZEOF_INT < SIZEOF_SSIZE_T
2854#define NEED_RETRY
2855#endif
2856
2857/* XXX This code is limited to "true" double-byte encodings, as
2858 a) it assumes an incomplete character consists of a single byte, and
2859 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2860 encodings, see IsDBCSLeadByteEx documentation. */
2861
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, rather than the trail byte of the preceding
   character.  The byte must satisfy IsDBCSLeadByte(); the position
   returned by CharPrev() is then used to tell the two cases apart. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
2872
2873/*
2874 * Decode MBCS string into unicode object. If 'final' is set, converts
2875 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2876 */
2877static int decode_mbcs(PyUnicodeObject **v,
2878 const char *s, /* MBCS string */
2879 int size, /* sizeof MBCS string */
2880 int final)
2881{
2882 Py_UNICODE *p;
2883 Py_ssize_t n = 0;
2884 int usize = 0;
2885
2886 assert(size >= 0);
2887
2888 /* Skip trailing lead-byte unless 'final' is set */
2889 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2890 --size;
2891
2892 /* First get the size of the result */
2893 if (size > 0) {
2894 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2895 if (usize == 0) {
2896 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2897 return -1;
2898 }
2899 }
2900
2901 if (*v == NULL) {
2902 /* Create unicode object */
2903 *v = _PyUnicode_New(usize);
2904 if (*v == NULL)
2905 return -1;
2906 }
2907 else {
2908 /* Extend unicode object */
2909 n = PyUnicode_GET_SIZE(*v);
2910 if (_PyUnicode_Resize(v, n + usize) < 0)
2911 return -1;
2912 }
2913
2914 /* Do the conversion */
2915 if (size > 0) {
2916 p = PyUnicode_AS_UNICODE(*v) + n;
2917 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2918 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2919 return -1;
2920 }
2921 }
2922
2923 return size;
2924}
2925
2926PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2927 Py_ssize_t size,
2928 const char *errors,
2929 Py_ssize_t *consumed)
2930{
2931 PyUnicodeObject *v = NULL;
2932 int done;
2933
2934 if (consumed)
2935 *consumed = 0;
2936
2937#ifdef NEED_RETRY
2938 retry:
2939 if (size > INT_MAX)
2940 done = decode_mbcs(&v, s, INT_MAX, 0);
2941 else
2942#endif
2943 done = decode_mbcs(&v, s, (int)size, !consumed);
2944
2945 if (done < 0) {
2946 Py_XDECREF(v);
2947 return NULL;
2948 }
2949
2950 if (consumed)
2951 *consumed += done;
2952
2953#ifdef NEED_RETRY
2954 if (size > INT_MAX) {
2955 s += done;
2956 size -= done;
2957 goto retry;
2958 }
2959#endif
2960
2961 return (PyObject *)v;
2962}
2963
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002964PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002965 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002966 const char *errors)
2967{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002968 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2969}
2970
2971/*
2972 * Convert unicode into string object (MBCS).
2973 * Returns 0 if succeed, -1 otherwise.
2974 */
2975static int encode_mbcs(PyObject **repr,
2976 const Py_UNICODE *p, /* unicode */
2977 int size) /* size of unicode */
2978{
2979 int mbcssize = 0;
2980 Py_ssize_t n = 0;
2981
2982 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002983
2984 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002985 if (size > 0) {
2986 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2987 if (mbcssize == 0) {
2988 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2989 return -1;
2990 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002991 }
2992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002993 if (*repr == NULL) {
2994 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002995 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002996 if (*repr == NULL)
2997 return -1;
2998 }
2999 else {
3000 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003001 n = PyBytes_Size(*repr);
3002 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003003 return -1;
3004 }
3005
3006 /* Do the conversion */
3007 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003008 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003009 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3010 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3011 return -1;
3012 }
3013 }
3014
3015 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003016}
3017
3018PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 const char *errors)
3021{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003022 PyObject *repr = NULL;
3023 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003024
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003025#ifdef NEED_RETRY
3026 retry:
3027 if (size > INT_MAX)
3028 ret = encode_mbcs(&repr, p, INT_MAX);
3029 else
3030#endif
3031 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003032
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003033 if (ret < 0) {
3034 Py_XDECREF(repr);
3035 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003036 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037
3038#ifdef NEED_RETRY
3039 if (size > INT_MAX) {
3040 p += INT_MAX;
3041 size -= INT_MAX;
3042 goto retry;
3043 }
3044#endif
3045
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046 return repr;
3047}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003048
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003049PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3050{
3051 if (!PyUnicode_Check(unicode)) {
3052 PyErr_BadArgument();
3053 return NULL;
3054 }
3055 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3056 PyUnicode_GET_SIZE(unicode),
3057 NULL);
3058}
3059
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003060#undef NEED_RETRY
3061
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003062#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064/* --- Character Mapping Codec -------------------------------------------- */
3065
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003067 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 PyObject *mapping,
3069 const char *errors)
3070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003072 Py_ssize_t startinpos;
3073 Py_ssize_t endinpos;
3074 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 PyUnicodeObject *v;
3077 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003078 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 PyObject *errorHandler = NULL;
3080 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003081 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003082 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003083
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 /* Default to Latin-1 */
3085 if (mapping == NULL)
3086 return PyUnicode_DecodeLatin1(s, size, errors);
3087
3088 v = _PyUnicode_New(size);
3089 if (v == NULL)
3090 goto onError;
3091 if (size == 0)
3092 return (PyObject *)v;
3093 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003095 if (PyUnicode_CheckExact(mapping)) {
3096 mapstring = PyUnicode_AS_UNICODE(mapping);
3097 maplen = PyUnicode_GET_SIZE(mapping);
3098 while (s < e) {
3099 unsigned char ch = *s;
3100 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003102 if (ch < maplen)
3103 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 if (x == 0xfffe) {
3106 /* undefined mapping */
3107 outpos = p-PyUnicode_AS_UNICODE(v);
3108 startinpos = s-starts;
3109 endinpos = startinpos+1;
3110 if (unicode_decode_call_errorhandler(
3111 errors, &errorHandler,
3112 "charmap", "character maps to <undefined>",
3113 starts, size, &startinpos, &endinpos, &exc, &s,
3114 (PyObject **)&v, &outpos, &p)) {
3115 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003116 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003117 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003118 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003119 *p++ = x;
3120 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003122 }
3123 else {
3124 while (s < e) {
3125 unsigned char ch = *s;
3126 PyObject *w, *x;
3127
3128 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3129 w = PyInt_FromLong((long)ch);
3130 if (w == NULL)
3131 goto onError;
3132 x = PyObject_GetItem(mapping, w);
3133 Py_DECREF(w);
3134 if (x == NULL) {
3135 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3136 /* No mapping found means: mapping is undefined. */
3137 PyErr_Clear();
3138 x = Py_None;
3139 Py_INCREF(x);
3140 } else
3141 goto onError;
3142 }
3143
3144 /* Apply mapping */
3145 if (PyInt_Check(x)) {
3146 long value = PyInt_AS_LONG(x);
3147 if (value < 0 || value > 65535) {
3148 PyErr_SetString(PyExc_TypeError,
3149 "character mapping must be in range(65536)");
3150 Py_DECREF(x);
3151 goto onError;
3152 }
3153 *p++ = (Py_UNICODE)value;
3154 }
3155 else if (x == Py_None) {
3156 /* undefined mapping */
3157 outpos = p-PyUnicode_AS_UNICODE(v);
3158 startinpos = s-starts;
3159 endinpos = startinpos+1;
3160 if (unicode_decode_call_errorhandler(
3161 errors, &errorHandler,
3162 "charmap", "character maps to <undefined>",
3163 starts, size, &startinpos, &endinpos, &exc, &s,
3164 (PyObject **)&v, &outpos, &p)) {
3165 Py_DECREF(x);
3166 goto onError;
3167 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003168 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003169 continue;
3170 }
3171 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003172 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173
3174 if (targetsize == 1)
3175 /* 1-1 mapping */
3176 *p++ = *PyUnicode_AS_UNICODE(x);
3177
3178 else if (targetsize > 1) {
3179 /* 1-n mapping */
3180 if (targetsize > extrachars) {
3181 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003182 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3183 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003184 (targetsize << 2);
3185 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003186 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003187 if (_PyUnicode_Resize(&v,
3188 PyUnicode_GET_SIZE(v) + needed) < 0) {
3189 Py_DECREF(x);
3190 goto onError;
3191 }
3192 p = PyUnicode_AS_UNICODE(v) + oldpos;
3193 }
3194 Py_UNICODE_COPY(p,
3195 PyUnicode_AS_UNICODE(x),
3196 targetsize);
3197 p += targetsize;
3198 extrachars -= targetsize;
3199 }
3200 /* 1-0 mapping: skip the character */
3201 }
3202 else {
3203 /* wrong return value */
3204 PyErr_SetString(PyExc_TypeError,
3205 "character mapping must return integer, None or unicode");
3206 Py_DECREF(x);
3207 goto onError;
3208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003210 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 }
3213 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 Py_XDECREF(errorHandler);
3222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_XDECREF(v);
3224 return NULL;
3225}
3226
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003227/* Charmap encoding: the lookup table */
3228
3229struct encoding_map{
3230 PyObject_HEAD
3231 unsigned char level1[32];
3232 int count2, count3;
3233 unsigned char level23[1];
3234};
3235
3236static PyObject*
3237encoding_map_size(PyObject *obj, PyObject* args)
3238{
3239 struct encoding_map *map = (struct encoding_map*)obj;
3240 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3241 128*map->count3);
3242}
3243
3244static PyMethodDef encoding_map_methods[] = {
3245 {"size", encoding_map_size, METH_NOARGS,
3246 PyDoc_STR("Return the size (in bytes) of this object") },
3247 { 0 }
3248};
3249
3250static void
3251encoding_map_dealloc(PyObject* o)
3252{
3253 PyObject_FREE(o);
3254}
3255
3256static PyTypeObject EncodingMapType = {
3257 PyObject_HEAD_INIT(NULL)
3258 0, /*ob_size*/
3259 "EncodingMap", /*tp_name*/
3260 sizeof(struct encoding_map), /*tp_basicsize*/
3261 0, /*tp_itemsize*/
3262 /* methods */
3263 encoding_map_dealloc, /*tp_dealloc*/
3264 0, /*tp_print*/
3265 0, /*tp_getattr*/
3266 0, /*tp_setattr*/
3267 0, /*tp_compare*/
3268 0, /*tp_repr*/
3269 0, /*tp_as_number*/
3270 0, /*tp_as_sequence*/
3271 0, /*tp_as_mapping*/
3272 0, /*tp_hash*/
3273 0, /*tp_call*/
3274 0, /*tp_str*/
3275 0, /*tp_getattro*/
3276 0, /*tp_setattro*/
3277 0, /*tp_as_buffer*/
3278 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3279 0, /*tp_doc*/
3280 0, /*tp_traverse*/
3281 0, /*tp_clear*/
3282 0, /*tp_richcompare*/
3283 0, /*tp_weaklistoffset*/
3284 0, /*tp_iter*/
3285 0, /*tp_iternext*/
3286 encoding_map_methods, /*tp_methods*/
3287 0, /*tp_members*/
3288 0, /*tp_getset*/
3289 0, /*tp_base*/
3290 0, /*tp_dict*/
3291 0, /*tp_descr_get*/
3292 0, /*tp_descr_set*/
3293 0, /*tp_dictoffset*/
3294 0, /*tp_init*/
3295 0, /*tp_alloc*/
3296 0, /*tp_new*/
3297 0, /*tp_free*/
3298 0, /*tp_is_gc*/
3299};
3300
3301PyObject*
3302PyUnicode_BuildEncodingMap(PyObject* string)
3303{
3304 Py_UNICODE *decode;
3305 PyObject *result;
3306 struct encoding_map *mresult;
3307 int i;
3308 int need_dict = 0;
3309 unsigned char level1[32];
3310 unsigned char level2[512];
3311 unsigned char *mlevel1, *mlevel2, *mlevel3;
3312 int count2 = 0, count3 = 0;
3313
3314 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3315 PyErr_BadArgument();
3316 return NULL;
3317 }
3318 decode = PyUnicode_AS_UNICODE(string);
3319 memset(level1, 0xFF, sizeof level1);
3320 memset(level2, 0xFF, sizeof level2);
3321
3322 /* If there isn't a one-to-one mapping of NULL to \0,
3323 or if there are non-BMP characters, we need to use
3324 a mapping dictionary. */
3325 if (decode[0] != 0)
3326 need_dict = 1;
3327 for (i = 1; i < 256; i++) {
3328 int l1, l2;
3329 if (decode[i] == 0
3330 #ifdef Py_UNICODE_WIDE
3331 || decode[i] > 0xFFFF
3332 #endif
3333 ) {
3334 need_dict = 1;
3335 break;
3336 }
3337 if (decode[i] == 0xFFFE)
3338 /* unmapped character */
3339 continue;
3340 l1 = decode[i] >> 11;
3341 l2 = decode[i] >> 7;
3342 if (level1[l1] == 0xFF)
3343 level1[l1] = count2++;
3344 if (level2[l2] == 0xFF)
3345 level2[l2] = count3++;
3346 }
3347
3348 if (count2 >= 0xFF || count3 >= 0xFF)
3349 need_dict = 1;
3350
3351 if (need_dict) {
3352 PyObject *result = PyDict_New();
3353 PyObject *key, *value;
3354 if (!result)
3355 return NULL;
3356 for (i = 0; i < 256; i++) {
3357 key = value = NULL;
3358 key = PyInt_FromLong(decode[i]);
3359 value = PyInt_FromLong(i);
3360 if (!key || !value)
3361 goto failed1;
3362 if (PyDict_SetItem(result, key, value) == -1)
3363 goto failed1;
3364 Py_DECREF(key);
3365 Py_DECREF(value);
3366 }
3367 return result;
3368 failed1:
3369 Py_XDECREF(key);
3370 Py_XDECREF(value);
3371 Py_DECREF(result);
3372 return NULL;
3373 }
3374
3375 /* Create a three-level trie */
3376 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3377 16*count2 + 128*count3 - 1);
3378 if (!result)
3379 return PyErr_NoMemory();
3380 PyObject_Init(result, &EncodingMapType);
3381 mresult = (struct encoding_map*)result;
3382 mresult->count2 = count2;
3383 mresult->count3 = count3;
3384 mlevel1 = mresult->level1;
3385 mlevel2 = mresult->level23;
3386 mlevel3 = mresult->level23 + 16*count2;
3387 memcpy(mlevel1, level1, 32);
3388 memset(mlevel2, 0xFF, 16*count2);
3389 memset(mlevel3, 0, 128*count3);
3390 count3 = 0;
3391 for (i = 1; i < 256; i++) {
3392 int o1, o2, o3, i2, i3;
3393 if (decode[i] == 0xFFFE)
3394 /* unmapped character */
3395 continue;
3396 o1 = decode[i]>>11;
3397 o2 = (decode[i]>>7) & 0xF;
3398 i2 = 16*mlevel1[o1] + o2;
3399 if (mlevel2[i2] == 0xFF)
3400 mlevel2[i2] = count3++;
3401 o3 = decode[i] & 0x7F;
3402 i3 = 128*mlevel2[i2] + o3;
3403 mlevel3[i3] = i;
3404 }
3405 return result;
3406}
3407
3408static int
3409encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3410{
3411 struct encoding_map *map = (struct encoding_map*)mapping;
3412 int l1 = c>>11;
3413 int l2 = (c>>7) & 0xF;
3414 int l3 = c & 0x7F;
3415 int i;
3416
3417#ifdef Py_UNICODE_WIDE
3418 if (c > 0xFFFF) {
3419 return -1;
3420 }
3421#endif
3422 if (c == 0)
3423 return 0;
3424 /* level 1*/
3425 i = map->level1[l1];
3426 if (i == 0xFF) {
3427 return -1;
3428 }
3429 /* level 2*/
3430 i = map->level23[16*i+l2];
3431 if (i == 0xFF) {
3432 return -1;
3433 }
3434 /* level 3 */
3435 i = map->level23[16*map->count2 + 128*i + l3];
3436 if (i == 0) {
3437 return -1;
3438 }
3439 return i;
3440}
3441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442/* Lookup the character ch in the mapping. If the character
3443 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003444 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 PyObject *w = PyInt_FromLong((long)c);
3448 PyObject *x;
3449
3450 if (w == NULL)
3451 return NULL;
3452 x = PyObject_GetItem(mapping, w);
3453 Py_DECREF(w);
3454 if (x == NULL) {
3455 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3456 /* No mapping found means: mapping is undefined. */
3457 PyErr_Clear();
3458 x = Py_None;
3459 Py_INCREF(x);
3460 return x;
3461 } else
3462 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003464 else if (x == Py_None)
3465 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 if (value < 0 || value > 255) {
3469 PyErr_SetString(PyExc_TypeError,
3470 "character mapping must be in range(256)");
3471 Py_DECREF(x);
3472 return NULL;
3473 }
3474 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 else if (PyString_Check(x))
3477 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 /* wrong return value */
3480 PyErr_SetString(PyExc_TypeError,
3481 "character mapping must return integer, None or str");
3482 Py_DECREF(x);
3483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 }
3485}
3486
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003487static int
3488charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3489{
3490 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3491 /* exponentially overallocate to minimize reallocations */
3492 if (requiredsize < 2*outsize)
3493 requiredsize = 2*outsize;
3494 if (_PyString_Resize(outobj, requiredsize)) {
3495 return 0;
3496 }
3497 return 1;
3498}
3499
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503/* lookup the character, put the result in the output string and adjust
3504 various state variables. Reallocate the output string if not enough
3505 space is available. Return a new reference to the object that
3506 was put in the output buffer, or Py_None, if the mapping was undefined
3507 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003508 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003510charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003513 PyObject *rep;
3514 char *outstart;
3515 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003517 if (mapping->ob_type == &EncodingMapType) {
3518 int res = encoding_map_lookup(c, mapping);
3519 Py_ssize_t requiredsize = *outpos+1;
3520 if (res == -1)
3521 return enc_FAILED;
3522 if (outsize<requiredsize)
3523 if (!charmapencode_resize(outobj, outpos, requiredsize))
3524 return enc_EXCEPTION;
3525 outstart = PyString_AS_STRING(*outobj);
3526 outstart[(*outpos)++] = (char)res;
3527 return enc_SUCCESS;
3528 }
3529
3530 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532 return enc_EXCEPTION;
3533 else if (rep==Py_None) {
3534 Py_DECREF(rep);
3535 return enc_FAILED;
3536 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003538 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003542 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003544 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3546 }
3547 else {
3548 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003549 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3550 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003551 if (outsize<requiredsize)
3552 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003554 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003556 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 memcpy(outstart + *outpos, repchars, repsize);
3558 *outpos += repsize;
3559 }
3560 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003561 Py_DECREF(rep);
3562 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563}
3564
3565/* handle an error in PyUnicode_EncodeCharmap
3566 Return 0 on success, -1 on error */
3567static
3568int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003571 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003572 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573{
3574 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t repsize;
3576 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 Py_UNICODE *uni2;
3578 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t collstartpos = *inpos;
3580 Py_ssize_t collendpos = *inpos+1;
3581 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 char *encoding = "charmap";
3583 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 /* find all unencodable characters */
3587 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 PyObject *rep;
3589 if (mapping->ob_type == &EncodingMapType) {
3590 int res = encoding_map_lookup(p[collendpos], mapping);
3591 if (res != -1)
3592 break;
3593 ++collendpos;
3594 continue;
3595 }
3596
3597 rep = charmapencode_lookup(p[collendpos], mapping);
3598 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 else if (rep!=Py_None) {
3601 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 break;
3603 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003604 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 ++collendpos;
3606 }
3607 /* cache callback name lookup
3608 * (if not done yet, i.e. it's the first error) */
3609 if (*known_errorHandler==-1) {
3610 if ((errors==NULL) || (!strcmp(errors, "strict")))
3611 *known_errorHandler = 1;
3612 else if (!strcmp(errors, "replace"))
3613 *known_errorHandler = 2;
3614 else if (!strcmp(errors, "ignore"))
3615 *known_errorHandler = 3;
3616 else if (!strcmp(errors, "xmlcharrefreplace"))
3617 *known_errorHandler = 4;
3618 else
3619 *known_errorHandler = 0;
3620 }
3621 switch (*known_errorHandler) {
3622 case 1: /* strict */
3623 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3624 return -1;
3625 case 2: /* replace */
3626 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3627 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003628 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 return -1;
3630 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003631 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3633 return -1;
3634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 }
3636 /* fall through */
3637 case 3: /* ignore */
3638 *inpos = collendpos;
3639 break;
3640 case 4: /* xmlcharrefreplace */
3641 /* generate replacement (temporarily (mis)uses p) */
3642 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3643 char buffer[2+29+1+1];
3644 char *cp;
3645 sprintf(buffer, "&#%d;", (int)p[collpos]);
3646 for (cp = buffer; *cp; ++cp) {
3647 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003648 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003650 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3652 return -1;
3653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 }
3655 }
3656 *inpos = collendpos;
3657 break;
3658 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003659 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 encoding, reason, p, size, exceptionObject,
3661 collstartpos, collendpos, &newpos);
3662 if (repunicode == NULL)
3663 return -1;
3664 /* generate replacement */
3665 repsize = PyUnicode_GET_SIZE(repunicode);
3666 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3667 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003668 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 return -1;
3670 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003671 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3674 return -1;
3675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677 *inpos = newpos;
3678 Py_DECREF(repunicode);
3679 }
3680 return 0;
3681}
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 PyObject *mapping,
3686 const char *errors)
3687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 /* output object */
3689 PyObject *res = NULL;
3690 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003691 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003693 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 PyObject *errorHandler = NULL;
3695 PyObject *exc = NULL;
3696 /* the following variable is used for caching string comparisons
3697 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3698 * 3=ignore, 4=xmlcharrefreplace */
3699 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
3701 /* Default to Latin-1 */
3702 if (mapping == NULL)
3703 return PyUnicode_EncodeLatin1(p, size, errors);
3704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 /* allocate enough for a simple encoding without
3706 replacements, if we need more, we'll resize */
3707 res = PyString_FromStringAndSize(NULL, size);
3708 if (res == NULL)
3709 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003710 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 while (inpos<size) {
3714 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3716 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003718 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 if (charmap_encoding_error(p, size, &inpos, mapping,
3720 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003721 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003722 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003723 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 else
3727 /* done with this character => adjust input position */
3728 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 /* Resize if we allocated to much */
3732 if (respos<PyString_GET_SIZE(res)) {
3733 if (_PyString_Resize(&res, respos))
3734 goto onError;
3735 }
3736 Py_XDECREF(exc);
3737 Py_XDECREF(errorHandler);
3738 return res;
3739
3740 onError:
3741 Py_XDECREF(res);
3742 Py_XDECREF(exc);
3743 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 return NULL;
3745}
3746
3747PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3748 PyObject *mapping)
3749{
3750 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3751 PyErr_BadArgument();
3752 return NULL;
3753 }
3754 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3755 PyUnicode_GET_SIZE(unicode),
3756 mapping,
3757 NULL);
3758}
3759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760/* create or adjust a UnicodeTranslateError */
3761static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003762 const Py_UNICODE *unicode, Py_ssize_t size,
3763 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 if (*exceptionObject == NULL) {
3767 *exceptionObject = PyUnicodeTranslateError_Create(
3768 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
3770 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3772 goto onError;
3773 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3774 goto onError;
3775 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3776 goto onError;
3777 return;
3778 onError:
3779 Py_DECREF(*exceptionObject);
3780 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
3782}
3783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784/* raises a UnicodeTranslateError */
3785static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 const Py_UNICODE *unicode, Py_ssize_t size,
3787 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 const char *reason)
3789{
3790 make_translate_exception(exceptionObject,
3791 unicode, size, startpos, endpos, reason);
3792 if (*exceptionObject != NULL)
3793 PyCodec_StrictErrors(*exceptionObject);
3794}
3795
3796/* error handling callback helper:
3797 build arguments, call the callback and check the arguments,
3798 put the result into newpos and return the replacement string, which
3799 has to be freed by the caller */
3800static PyObject *unicode_translate_call_errorhandler(const char *errors,
3801 PyObject **errorHandler,
3802 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003803 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3804 Py_ssize_t startpos, Py_ssize_t endpos,
3805 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003807 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003809 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 PyObject *restuple;
3811 PyObject *resunicode;
3812
3813 if (*errorHandler == NULL) {
3814 *errorHandler = PyCodec_LookupError(errors);
3815 if (*errorHandler == NULL)
3816 return NULL;
3817 }
3818
3819 make_translate_exception(exceptionObject,
3820 unicode, size, startpos, endpos, reason);
3821 if (*exceptionObject == NULL)
3822 return NULL;
3823
3824 restuple = PyObject_CallFunctionObjArgs(
3825 *errorHandler, *exceptionObject, NULL);
3826 if (restuple == NULL)
3827 return NULL;
3828 if (!PyTuple_Check(restuple)) {
3829 PyErr_Format(PyExc_TypeError, &argparse[4]);
3830 Py_DECREF(restuple);
3831 return NULL;
3832 }
3833 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_DECREF(restuple);
3836 return NULL;
3837 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003838 if (i_newpos<0)
3839 *newpos = size+i_newpos;
3840 else
3841 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003842 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003843 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003844 Py_DECREF(restuple);
3845 return NULL;
3846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 Py_INCREF(resunicode);
3848 Py_DECREF(restuple);
3849 return resunicode;
3850}
3851
3852/* Lookup the character ch in the mapping and put the result in result,
3853 which must be decrefed by the caller.
3854 Return 0 on success, -1 on error */
3855static
3856int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3857{
3858 PyObject *w = PyInt_FromLong((long)c);
3859 PyObject *x;
3860
3861 if (w == NULL)
3862 return -1;
3863 x = PyObject_GetItem(mapping, w);
3864 Py_DECREF(w);
3865 if (x == NULL) {
3866 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3867 /* No mapping found means: use 1:1 mapping. */
3868 PyErr_Clear();
3869 *result = NULL;
3870 return 0;
3871 } else
3872 return -1;
3873 }
3874 else if (x == Py_None) {
3875 *result = x;
3876 return 0;
3877 }
3878 else if (PyInt_Check(x)) {
3879 long value = PyInt_AS_LONG(x);
3880 long max = PyUnicode_GetMax();
3881 if (value < 0 || value > max) {
3882 PyErr_Format(PyExc_TypeError,
3883 "character mapping must be in range(0x%lx)", max+1);
3884 Py_DECREF(x);
3885 return -1;
3886 }
3887 *result = x;
3888 return 0;
3889 }
3890 else if (PyUnicode_Check(x)) {
3891 *result = x;
3892 return 0;
3893 }
3894 else {
3895 /* wrong return value */
3896 PyErr_SetString(PyExc_TypeError,
3897 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003898 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return -1;
3900 }
3901}
3902/* ensure that *outobj is at least requiredsize characters long,
3903if not reallocate and adjust various state variables.
3904Return 0 on success, -1 on error */
3905static
Walter Dörwald4894c302003-10-24 14:25:28 +00003906int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003907 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003910 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003914 if (requiredsize < 2 * oldsize)
3915 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003916 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 return -1;
3918 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 }
3920 return 0;
3921}
3922/* lookup the character, put the result in the output string and adjust
3923 various state variables. Return a new reference to the object that
3924 was put in the output buffer in *result, or Py_None, if the mapping was
3925 undefined (in which case no character was written).
3926 The called must decref result.
3927 Return 0 on success, -1 on error. */
3928static
Walter Dörwald4894c302003-10-24 14:25:28 +00003929int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003931 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932{
Walter Dörwald4894c302003-10-24 14:25:28 +00003933 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 return -1;
3935 if (*res==NULL) {
3936 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003937 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 }
3939 else if (*res==Py_None)
3940 ;
3941 else if (PyInt_Check(*res)) {
3942 /* no overflow check, because we know that the space is enough */
3943 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3944 }
3945 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 if (repsize==1) {
3948 /* no overflow check, because we know that the space is enough */
3949 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3950 }
3951 else if (repsize!=0) {
3952 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003954 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003955 repsize - 1;
3956 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 return -1;
3958 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3959 *outp += repsize;
3960 }
3961 }
3962 else
3963 return -1;
3964 return 0;
3965}
3966
3967PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 PyObject *mapping,
3970 const char *errors)
3971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 /* output object */
3973 PyObject *res = NULL;
3974 /* pointers to the beginning and end+1 of input */
3975 const Py_UNICODE *startp = p;
3976 const Py_UNICODE *endp = p + size;
3977 /* pointer into the output */
3978 Py_UNICODE *str;
3979 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 char *reason = "character maps to <undefined>";
3982 PyObject *errorHandler = NULL;
3983 PyObject *exc = NULL;
3984 /* the following variable is used for caching string comparisons
3985 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3986 * 3=ignore, 4=xmlcharrefreplace */
3987 int known_errorHandler = -1;
3988
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (mapping == NULL) {
3990 PyErr_BadArgument();
3991 return NULL;
3992 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993
3994 /* allocate enough for a simple 1:1 translation without
3995 replacements, if we need more, we'll resize */
3996 res = PyUnicode_FromUnicode(NULL, size);
3997 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003998 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 return res;
4001 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 while (p<endp) {
4004 /* try to encode it */
4005 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004006 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 goto onError;
4009 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004010 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 if (x!=Py_None) /* it worked => adjust input pointer */
4012 ++p;
4013 else { /* untranslatable character */
4014 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 Py_ssize_t repsize;
4016 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 Py_UNICODE *uni2;
4018 /* startpos for collecting untranslatable chars */
4019 const Py_UNICODE *collstart = p;
4020 const Py_UNICODE *collend = p+1;
4021 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 /* find all untranslatable characters */
4024 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004025 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 goto onError;
4027 Py_XDECREF(x);
4028 if (x!=Py_None)
4029 break;
4030 ++collend;
4031 }
4032 /* cache callback name lookup
4033 * (if not done yet, i.e. it's the first error) */
4034 if (known_errorHandler==-1) {
4035 if ((errors==NULL) || (!strcmp(errors, "strict")))
4036 known_errorHandler = 1;
4037 else if (!strcmp(errors, "replace"))
4038 known_errorHandler = 2;
4039 else if (!strcmp(errors, "ignore"))
4040 known_errorHandler = 3;
4041 else if (!strcmp(errors, "xmlcharrefreplace"))
4042 known_errorHandler = 4;
4043 else
4044 known_errorHandler = 0;
4045 }
4046 switch (known_errorHandler) {
4047 case 1: /* strict */
4048 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4049 goto onError;
4050 case 2: /* replace */
4051 /* No need to check for space, this is a 1:1 replacement */
4052 for (coll = collstart; coll<collend; ++coll)
4053 *str++ = '?';
4054 /* fall through */
4055 case 3: /* ignore */
4056 p = collend;
4057 break;
4058 case 4: /* xmlcharrefreplace */
4059 /* generate replacement (temporarily (mis)uses p) */
4060 for (p = collstart; p < collend; ++p) {
4061 char buffer[2+29+1+1];
4062 char *cp;
4063 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004064 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4066 goto onError;
4067 for (cp = buffer; *cp; ++cp)
4068 *str++ = *cp;
4069 }
4070 p = collend;
4071 break;
4072 default:
4073 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4074 reason, startp, size, &exc,
4075 collstart-startp, collend-startp, &newpos);
4076 if (repunicode == NULL)
4077 goto onError;
4078 /* generate replacement */
4079 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004080 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4082 Py_DECREF(repunicode);
4083 goto onError;
4084 }
4085 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4086 *str++ = *uni2;
4087 p = startp + newpos;
4088 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 }
4090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 /* Resize if we allocated to much */
4093 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004094 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004095 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 }
4098 Py_XDECREF(exc);
4099 Py_XDECREF(errorHandler);
4100 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 onError:
4103 Py_XDECREF(res);
4104 Py_XDECREF(exc);
4105 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 return NULL;
4107}
4108
4109PyObject *PyUnicode_Translate(PyObject *str,
4110 PyObject *mapping,
4111 const char *errors)
4112{
4113 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004114
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 str = PyUnicode_FromObject(str);
4116 if (str == NULL)
4117 goto onError;
4118 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4119 PyUnicode_GET_SIZE(str),
4120 mapping,
4121 errors);
4122 Py_DECREF(str);
4123 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 onError:
4126 Py_XDECREF(str);
4127 return NULL;
4128}
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossum9e896b32000-04-05 20:11:21 +00004130/* --- Decimal Encoder ---------------------------------------------------- */
4131
4132int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004134 char *output,
4135 const char *errors)
4136{
4137 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 PyObject *errorHandler = NULL;
4139 PyObject *exc = NULL;
4140 const char *encoding = "decimal";
4141 const char *reason = "invalid decimal Unicode string";
4142 /* the following variable is used for caching string comparisons
4143 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4144 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004145
4146 if (output == NULL) {
4147 PyErr_BadArgument();
4148 return -1;
4149 }
4150
4151 p = s;
4152 end = s + length;
4153 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004155 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t repsize;
4158 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 Py_UNICODE *uni2;
4160 Py_UNICODE *collstart;
4161 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004162
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 if (Py_UNICODE_ISSPACE(ch)) {
4164 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004166 continue;
4167 }
4168 decimal = Py_UNICODE_TODECIMAL(ch);
4169 if (decimal >= 0) {
4170 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004172 continue;
4173 }
Guido van Rossumba477042000-04-06 18:18:10 +00004174 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004175 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004177 continue;
4178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* All other characters are considered unencodable */
4180 collstart = p;
4181 collend = p+1;
4182 while (collend < end) {
4183 if ((0 < *collend && *collend < 256) ||
4184 !Py_UNICODE_ISSPACE(*collend) ||
4185 Py_UNICODE_TODECIMAL(*collend))
4186 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 /* cache callback name lookup
4189 * (if not done yet, i.e. it's the first error) */
4190 if (known_errorHandler==-1) {
4191 if ((errors==NULL) || (!strcmp(errors, "strict")))
4192 known_errorHandler = 1;
4193 else if (!strcmp(errors, "replace"))
4194 known_errorHandler = 2;
4195 else if (!strcmp(errors, "ignore"))
4196 known_errorHandler = 3;
4197 else if (!strcmp(errors, "xmlcharrefreplace"))
4198 known_errorHandler = 4;
4199 else
4200 known_errorHandler = 0;
4201 }
4202 switch (known_errorHandler) {
4203 case 1: /* strict */
4204 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4205 goto onError;
4206 case 2: /* replace */
4207 for (p = collstart; p < collend; ++p)
4208 *output++ = '?';
4209 /* fall through */
4210 case 3: /* ignore */
4211 p = collend;
4212 break;
4213 case 4: /* xmlcharrefreplace */
4214 /* generate replacement (temporarily (mis)uses p) */
4215 for (p = collstart; p < collend; ++p)
4216 output += sprintf(output, "&#%d;", (int)*p);
4217 p = collend;
4218 break;
4219 default:
4220 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4221 encoding, reason, s, length, &exc,
4222 collstart-s, collend-s, &newpos);
4223 if (repunicode == NULL)
4224 goto onError;
4225 /* generate replacement */
4226 repsize = PyUnicode_GET_SIZE(repunicode);
4227 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4228 Py_UNICODE ch = *uni2;
4229 if (Py_UNICODE_ISSPACE(ch))
4230 *output++ = ' ';
4231 else {
4232 decimal = Py_UNICODE_TODECIMAL(ch);
4233 if (decimal >= 0)
4234 *output++ = '0' + decimal;
4235 else if (0 < ch && ch < 256)
4236 *output++ = (char)ch;
4237 else {
4238 Py_DECREF(repunicode);
4239 raise_encode_exception(&exc, encoding,
4240 s, length, collstart-s, collend-s, reason);
4241 goto onError;
4242 }
4243 }
4244 }
4245 p = s + newpos;
4246 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004247 }
4248 }
4249 /* 0-terminate the output string */
4250 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 Py_XDECREF(exc);
4252 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004253 return 0;
4254
4255 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 Py_XDECREF(exc);
4257 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004258 return -1;
4259}
4260
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261/* --- Helpers ------------------------------------------------------------ */
4262
Thomas Wouters477c8d52006-05-27 19:21:47 +00004263#define STRINGLIB_CHAR Py_UNICODE
4264
4265#define STRINGLIB_LEN PyUnicode_GET_SIZE
4266#define STRINGLIB_NEW PyUnicode_FromUnicode
4267#define STRINGLIB_STR PyUnicode_AS_UNICODE
4268
4269Py_LOCAL_INLINE(int)
4270STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004272 if (str[0] != other[0])
4273 return 1;
4274 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275}
4276
Thomas Wouters477c8d52006-05-27 19:21:47 +00004277#define STRINGLIB_EMPTY unicode_empty
4278
4279#include "stringlib/fastsearch.h"
4280
4281#include "stringlib/count.h"
4282#include "stringlib/find.h"
4283#include "stringlib/partition.h"
4284
/* Helper macro to fix up start/end slice values in place.  It operates
   on variables named `start` and `end` in the enclosing scope: a
   negative value is taken relative to (obj)->length and then floored
   at 0, and end is capped at (obj)->length.  start is not capped. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4297
Martin v. Löwis18e16552006-02-15 17:27:45 +00004298Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004299 PyObject *substr,
4300 Py_ssize_t start,
4301 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004304 PyUnicodeObject* str_obj;
4305 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004306
Thomas Wouters477c8d52006-05-27 19:21:47 +00004307 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4308 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004310 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4311 if (!sub_obj) {
4312 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return -1;
4314 }
Tim Petersced69f82003-09-16 20:30:58 +00004315
Thomas Wouters477c8d52006-05-27 19:21:47 +00004316 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004317
Thomas Wouters477c8d52006-05-27 19:21:47 +00004318 result = stringlib_count(
4319 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4320 );
4321
4322 Py_DECREF(sub_obj);
4323 Py_DECREF(str_obj);
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return result;
4326}
4327
Martin v. Löwis18e16552006-02-15 17:27:45 +00004328Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004329 PyObject *sub,
4330 Py_ssize_t start,
4331 Py_ssize_t end,
4332 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004334 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004337 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004338 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004339 sub = PyUnicode_FromObject(sub);
4340 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004341 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004342 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 }
Tim Petersced69f82003-09-16 20:30:58 +00004344
Thomas Wouters477c8d52006-05-27 19:21:47 +00004345 if (direction > 0)
4346 result = stringlib_find_slice(
4347 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4348 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4349 start, end
4350 );
4351 else
4352 result = stringlib_rfind_slice(
4353 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4354 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4355 start, end
4356 );
4357
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004359 Py_DECREF(sub);
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 return result;
4362}
4363
Tim Petersced69f82003-09-16 20:30:58 +00004364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365int tailmatch(PyUnicodeObject *self,
4366 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t start,
4368 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 int direction)
4370{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 if (substring->length == 0)
4372 return 1;
4373
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
4376 end -= substring->length;
4377 if (end < start)
4378 return 0;
4379
4380 if (direction > 0) {
4381 if (Py_UNICODE_MATCH(self, end, substring))
4382 return 1;
4383 } else {
4384 if (Py_UNICODE_MATCH(self, start, substring))
4385 return 1;
4386 }
4387
4388 return 0;
4389}
4390
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t start,
4394 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 int direction)
4396{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 str = PyUnicode_FromObject(str);
4400 if (str == NULL)
4401 return -1;
4402 substr = PyUnicode_FromObject(substr);
4403 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004404 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return -1;
4406 }
Tim Petersced69f82003-09-16 20:30:58 +00004407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 result = tailmatch((PyUnicodeObject *)str,
4409 (PyUnicodeObject *)substr,
4410 start, end, direction);
4411 Py_DECREF(str);
4412 Py_DECREF(substr);
4413 return result;
4414}
4415
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416/* Apply fixfct filter to the Unicode object self and return a
4417 reference to the modified object */
4418
Tim Petersced69f82003-09-16 20:30:58 +00004419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420PyObject *fixup(PyUnicodeObject *self,
4421 int (*fixfct)(PyUnicodeObject *s))
4422{
4423
4424 PyUnicodeObject *u;
4425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004426 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 if (u == NULL)
4428 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004429
4430 Py_UNICODE_COPY(u->str, self->str, self->length);
4431
Tim Peters7a29bd52001-09-12 03:03:31 +00004432 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 /* fixfct should return TRUE if it modified the buffer. If
4434 FALSE, return a reference to the original buffer instead
4435 (to save space, not time) */
4436 Py_INCREF(self);
4437 Py_DECREF(u);
4438 return (PyObject*) self;
4439 }
4440 return (PyObject*) u;
4441}
4442
Tim Petersced69f82003-09-16 20:30:58 +00004443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444int fixupper(PyUnicodeObject *self)
4445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 Py_UNICODE *s = self->str;
4448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 while (len-- > 0) {
4451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 ch = Py_UNICODE_TOUPPER(*s);
4454 if (ch != *s) {
4455 status = 1;
4456 *s = ch;
4457 }
4458 s++;
4459 }
4460
4461 return status;
4462}
4463
Tim Petersced69f82003-09-16 20:30:58 +00004464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465int fixlower(PyUnicodeObject *self)
4466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 Py_UNICODE *s = self->str;
4469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 while (len-- > 0) {
4472 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 ch = Py_UNICODE_TOLOWER(*s);
4475 if (ch != *s) {
4476 status = 1;
4477 *s = ch;
4478 }
4479 s++;
4480 }
4481
4482 return status;
4483}
4484
Tim Petersced69f82003-09-16 20:30:58 +00004485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486int fixswapcase(PyUnicodeObject *self)
4487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 Py_UNICODE *s = self->str;
4490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004491
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 while (len-- > 0) {
4493 if (Py_UNICODE_ISUPPER(*s)) {
4494 *s = Py_UNICODE_TOLOWER(*s);
4495 status = 1;
4496 } else if (Py_UNICODE_ISLOWER(*s)) {
4497 *s = Py_UNICODE_TOUPPER(*s);
4498 status = 1;
4499 }
4500 s++;
4501 }
4502
4503 return status;
4504}
4505
Tim Petersced69f82003-09-16 20:30:58 +00004506static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507int fixcapitalize(PyUnicodeObject *self)
4508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004510 Py_UNICODE *s = self->str;
4511 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004512
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004513 if (len == 0)
4514 return 0;
4515 if (Py_UNICODE_ISLOWER(*s)) {
4516 *s = Py_UNICODE_TOUPPER(*s);
4517 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004519 s++;
4520 while (--len > 0) {
4521 if (Py_UNICODE_ISUPPER(*s)) {
4522 *s = Py_UNICODE_TOLOWER(*s);
4523 status = 1;
4524 }
4525 s++;
4526 }
4527 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528}
4529
4530static
4531int fixtitle(PyUnicodeObject *self)
4532{
4533 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4534 register Py_UNICODE *e;
4535 int previous_is_cased;
4536
4537 /* Shortcut for single character strings */
4538 if (PyUnicode_GET_SIZE(self) == 1) {
4539 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4540 if (*p != ch) {
4541 *p = ch;
4542 return 1;
4543 }
4544 else
4545 return 0;
4546 }
Tim Petersced69f82003-09-16 20:30:58 +00004547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 e = p + PyUnicode_GET_SIZE(self);
4549 previous_is_cased = 0;
4550 for (; p < e; p++) {
4551 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004552
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 if (previous_is_cased)
4554 *p = Py_UNICODE_TOLOWER(ch);
4555 else
4556 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004557
4558 if (Py_UNICODE_ISLOWER(ch) ||
4559 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 Py_UNICODE_ISTITLE(ch))
4561 previous_is_cased = 1;
4562 else
4563 previous_is_cased = 0;
4564 }
4565 return 1;
4566}
4567
Tim Peters8ce9f162004-08-27 01:49:32 +00004568PyObject *
4569PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Tim Peters8ce9f162004-08-27 01:49:32 +00004571 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004572 const Py_UNICODE blank = ' ';
4573 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004574 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004575 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004576 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4577 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004578 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4579 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004581 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004582 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Tim Peters05eba1f2004-08-27 21:32:02 +00004584 fseq = PySequence_Fast(seq, "");
4585 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004586 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004587 }
4588
Tim Peters91879ab2004-08-27 22:35:44 +00004589 /* Grrrr. A codec may be invoked to convert str objects to
4590 * Unicode, and so it's possible to call back into Python code
4591 * during PyUnicode_FromObject(), and so it's possible for a sick
4592 * codec to change the size of fseq (if seq is a list). Therefore
4593 * we have to keep refetching the size -- can't assume seqlen
4594 * is invariant.
4595 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004596 seqlen = PySequence_Fast_GET_SIZE(fseq);
4597 /* If empty sequence, return u"". */
4598 if (seqlen == 0) {
4599 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4600 goto Done;
4601 }
4602 /* If singleton sequence with an exact Unicode, return that. */
4603 if (seqlen == 1) {
4604 item = PySequence_Fast_GET_ITEM(fseq, 0);
4605 if (PyUnicode_CheckExact(item)) {
4606 Py_INCREF(item);
4607 res = (PyUnicodeObject *)item;
4608 goto Done;
4609 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004610 }
4611
Tim Peters05eba1f2004-08-27 21:32:02 +00004612 /* At least two items to join, or one that isn't exact Unicode. */
4613 if (seqlen > 1) {
4614 /* Set up sep and seplen -- they're needed. */
4615 if (separator == NULL) {
4616 sep = &blank;
4617 seplen = 1;
4618 }
4619 else {
4620 internal_separator = PyUnicode_FromObject(separator);
4621 if (internal_separator == NULL)
4622 goto onError;
4623 sep = PyUnicode_AS_UNICODE(internal_separator);
4624 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004625 /* In case PyUnicode_FromObject() mutated seq. */
4626 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004627 }
4628 }
4629
4630 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004631 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004633 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 res_p = PyUnicode_AS_UNICODE(res);
4635 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004636
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004638 Py_ssize_t itemlen;
4639 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004640
4641 item = PySequence_Fast_GET_ITEM(fseq, i);
4642 /* Convert item to Unicode. */
4643 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4644 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004645 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004646 " %.80s found",
4647 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004648 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004649 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 item = PyUnicode_FromObject(item);
4651 if (item == NULL)
4652 goto onError;
4653 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004654
Tim Peters91879ab2004-08-27 22:35:44 +00004655 /* In case PyUnicode_FromObject() mutated seq. */
4656 seqlen = PySequence_Fast_GET_SIZE(fseq);
4657
Tim Peters8ce9f162004-08-27 01:49:32 +00004658 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004660 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004662 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 if (i < seqlen - 1) {
4664 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004665 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 goto Overflow;
4667 }
4668 if (new_res_used > res_alloc) {
4669 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004670 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004672 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004673 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004675 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004676 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004678 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004681
4682 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004683 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 res_p += itemlen;
4685 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004686 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004687 res_p += seplen;
4688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 res_used = new_res_used;
4691 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004692
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 /* Shrink res to match the used area; this probably can't fail,
4694 * but it's cheap to check.
4695 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004696 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004697 goto onError;
4698
4699 Done:
4700 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004701 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return (PyObject *)res;
4703
Tim Peters8ce9f162004-08-27 01:49:32 +00004704 Overflow:
4705 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004707 Py_DECREF(item);
4708 /* fall through */
4709
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004711 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004712 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004713 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 return NULL;
4715}
4716
Tim Petersced69f82003-09-16 20:30:58 +00004717static
4718PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t left,
4720 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 Py_UNICODE fill)
4722{
4723 PyUnicodeObject *u;
4724
4725 if (left < 0)
4726 left = 0;
4727 if (right < 0)
4728 right = 0;
4729
Tim Peters7a29bd52001-09-12 03:03:31 +00004730 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 Py_INCREF(self);
4732 return self;
4733 }
4734
4735 u = _PyUnicode_New(left + self->length + right);
4736 if (u) {
4737 if (left)
4738 Py_UNICODE_FILL(u->str, fill, left);
4739 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4740 if (right)
4741 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4742 }
4743
4744 return u;
4745}
4746
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `list` variable and an `onError` label; control jumps to `onError`
   when creating the slice or appending it fails.  The reference held in
   `str` is released once the append has been attempted.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and cannot capture a following `else`.  Note
   that `left` is evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4757
4758static
4759PyObject *split_whitespace(PyUnicodeObject *self,
4760 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 register Py_ssize_t i;
4764 register Py_ssize_t j;
4765 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 PyObject *str;
4767
4768 for (i = j = 0; i < len; ) {
4769 /* find a token */
4770 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4771 i++;
4772 j = i;
4773 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4774 i++;
4775 if (j < i) {
4776 if (maxcount-- <= 0)
4777 break;
4778 SPLIT_APPEND(self->str, j, i);
4779 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4780 i++;
4781 j = i;
4782 }
4783 }
4784 if (j < len) {
4785 SPLIT_APPEND(self->str, j, len);
4786 }
4787 return list;
4788
4789 onError:
4790 Py_DECREF(list);
4791 return NULL;
4792}
4793
4794PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004795 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 register Py_ssize_t i;
4798 register Py_ssize_t j;
4799 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 PyObject *list;
4801 PyObject *str;
4802 Py_UNICODE *data;
4803
4804 string = PyUnicode_FromObject(string);
4805 if (string == NULL)
4806 return NULL;
4807 data = PyUnicode_AS_UNICODE(string);
4808 len = PyUnicode_GET_SIZE(string);
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 list = PyList_New(0);
4811 if (!list)
4812 goto onError;
4813
4814 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004818 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
4821 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004822 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 if (i < len) {
4824 if (data[i] == '\r' && i + 1 < len &&
4825 data[i+1] == '\n')
4826 i += 2;
4827 else
4828 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004829 if (keepends)
4830 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Guido van Rossum86662912000-04-11 15:38:46 +00004832 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 j = i;
4834 }
4835 if (j < len) {
4836 SPLIT_APPEND(data, j, len);
4837 }
4838
4839 Py_DECREF(string);
4840 return list;
4841
4842 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004843 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 Py_DECREF(string);
4845 return NULL;
4846}
4847
Tim Petersced69f82003-09-16 20:30:58 +00004848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849PyObject *split_char(PyUnicodeObject *self,
4850 PyObject *list,
4851 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 register Py_ssize_t i;
4855 register Py_ssize_t j;
4856 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 PyObject *str;
4858
4859 for (i = j = 0; i < len; ) {
4860 if (self->str[i] == ch) {
4861 if (maxcount-- <= 0)
4862 break;
4863 SPLIT_APPEND(self->str, j, i);
4864 i = j = i + 1;
4865 } else
4866 i++;
4867 }
4868 if (j <= len) {
4869 SPLIT_APPEND(self->str, j, len);
4870 }
4871 return list;
4872
4873 onError:
4874 Py_DECREF(list);
4875 return NULL;
4876}
4877
Tim Petersced69f82003-09-16 20:30:58 +00004878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879PyObject *split_substring(PyUnicodeObject *self,
4880 PyObject *list,
4881 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 register Py_ssize_t i;
4885 register Py_ssize_t j;
4886 Py_ssize_t len = self->length;
4887 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 PyObject *str;
4889
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004890 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 if (Py_UNICODE_MATCH(self, i, substring)) {
4892 if (maxcount-- <= 0)
4893 break;
4894 SPLIT_APPEND(self->str, j, i);
4895 i = j = i + sublen;
4896 } else
4897 i++;
4898 }
4899 if (j <= len) {
4900 SPLIT_APPEND(self->str, j, len);
4901 }
4902 return list;
4903
4904 onError:
4905 Py_DECREF(list);
4906 return NULL;
4907}
4908
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004909static
4910PyObject *rsplit_whitespace(PyUnicodeObject *self,
4911 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004913{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 register Py_ssize_t i;
4915 register Py_ssize_t j;
4916 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004917 PyObject *str;
4918
4919 for (i = j = len - 1; i >= 0; ) {
4920 /* find a token */
4921 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4922 i--;
4923 j = i;
4924 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4925 i--;
4926 if (j > i) {
4927 if (maxcount-- <= 0)
4928 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004929 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004930 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4931 i--;
4932 j = i;
4933 }
4934 }
4935 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004936 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004937 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004938 if (PyList_Reverse(list) < 0)
4939 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940 return list;
4941
4942 onError:
4943 Py_DECREF(list);
4944 return NULL;
4945}
4946
4947static
4948PyObject *rsplit_char(PyUnicodeObject *self,
4949 PyObject *list,
4950 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004951 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004953 register Py_ssize_t i;
4954 register Py_ssize_t j;
4955 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956 PyObject *str;
4957
4958 for (i = j = len - 1; i >= 0; ) {
4959 if (self->str[i] == ch) {
4960 if (maxcount-- <= 0)
4961 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004962 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004963 j = i = i - 1;
4964 } else
4965 i--;
4966 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004967 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004968 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004970 if (PyList_Reverse(list) < 0)
4971 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972 return list;
4973
4974 onError:
4975 Py_DECREF(list);
4976 return NULL;
4977}
4978
4979static
4980PyObject *rsplit_substring(PyUnicodeObject *self,
4981 PyObject *list,
4982 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004983 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 register Py_ssize_t i;
4986 register Py_ssize_t j;
4987 Py_ssize_t len = self->length;
4988 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 PyObject *str;
4990
4991 for (i = len - sublen, j = len; i >= 0; ) {
4992 if (Py_UNICODE_MATCH(self, i, substring)) {
4993 if (maxcount-- <= 0)
4994 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004995 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996 j = i;
4997 i -= sublen;
4998 } else
4999 i--;
5000 }
5001 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005002 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005004 if (PyList_Reverse(list) < 0)
5005 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005006 return list;
5007
5008 onError:
5009 Py_DECREF(list);
5010 return NULL;
5011}
5012
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013#undef SPLIT_APPEND
5014
5015static
5016PyObject *split(PyUnicodeObject *self,
5017 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005018 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019{
5020 PyObject *list;
5021
5022 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005023 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
5025 list = PyList_New(0);
5026 if (!list)
5027 return NULL;
5028
5029 if (substring == NULL)
5030 return split_whitespace(self,list,maxcount);
5031
5032 else if (substring->length == 1)
5033 return split_char(self,list,substring->str[0],maxcount);
5034
5035 else if (substring->length == 0) {
5036 Py_DECREF(list);
5037 PyErr_SetString(PyExc_ValueError, "empty separator");
5038 return NULL;
5039 }
5040 else
5041 return split_substring(self,list,substring,maxcount);
5042}
5043
Tim Petersced69f82003-09-16 20:30:58 +00005044static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005045PyObject *rsplit(PyUnicodeObject *self,
5046 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048{
5049 PyObject *list;
5050
5051 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005052 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053
5054 list = PyList_New(0);
5055 if (!list)
5056 return NULL;
5057
5058 if (substring == NULL)
5059 return rsplit_whitespace(self,list,maxcount);
5060
5061 else if (substring->length == 1)
5062 return rsplit_char(self,list,substring->str[0],maxcount);
5063
5064 else if (substring->length == 0) {
5065 Py_DECREF(list);
5066 PyErr_SetString(PyExc_ValueError, "empty separator");
5067 return NULL;
5068 }
5069 else
5070 return rsplit_substring(self,list,substring,maxcount);
5071}
5072
/* Return a copy of self in which up to maxcount occurrences of str1
   have been replaced by str2.  A negative maxcount means "no limit".

   Three strategies are used:
     - str1 and str2 have the same length: copy self and overwrite the
       matches in place (with a dedicated loop for single characters);
     - lengths differ and str1 is non-empty: count the matches, allocate
       the exact result size and copy pieces across;
     - str1 is empty: interleave str2 between the characters of self.

   When nothing is replaced, self is returned with a new reference if it
   is an exact unicode instance, otherwise a plain unicode copy is
   returned.  Returns NULL on allocation failure or if the result size
   overflows (OverflowError). */
static
PyObject *replace(PyUnicodeObject *self,
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

    if (str1->length == str2->length) {
        /* same length */
        Py_ssize_t i;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            /* presumably findchar() reports whether the character occurs
               at all; a false result means there is nothing to do */
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    /* stop once the replacement budget is used up */
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            /* a negative result from fastsearch() is treated as
               "no occurrence" */
            i = fastsearch(
                self->str, self->length, str1->str, str1->length, FAST_SEARCH
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            /* matching is done against self, writing into the copy u;
               i starts at the offset fastsearch() returned */
            while (i <= self->length - str1->length)
                if (Py_UNICODE_MATCH(self, i, str1)) {
                    if (--maxcount < 0)
                        break;
                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                    i += str1->length;
                } else
                    i++;
        }
    } else {

        Py_ssize_t n, i, j, e;
        Py_ssize_t product, new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        /* n is the number of replacements to perform: the count
           reported by stringlib_count(), capped at maxcount */
        n = stringlib_count(self->str, self->length, str1->str, str1->length);
        if (n > maxcount)
            n = maxcount;
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        /* delta cannot be 0 here: this branch is only entered when the
           two lengths differ, so the first arm is defensive */
        if (delta == 0) {
            new_size = self->length;
        } else {
            product = n * (str2->length - str1->length);
            /* dividing back detects a multiplication that wrapped */
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;          /* read position in self */
        p = u->str;     /* write position in the result */
        e = self->length - str1->length;    /* last offset a match can start at */
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = i;
                while (j <= e) {
                    if (Py_UNICODE_MATCH(self, j, str1))
                        break;
                    j++;
                }
                if (j > i) {
                    /* j ran past e: no further match was found */
                    if (j > e)
                        break;
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self,
               repeating until n copies of str2 have been written */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            /* whatever is left of self follows the last inserted str2 */
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}
5207
5208/* --- Unicode Object Methods --------------------------------------------- */
5209
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): hands self to fixup() together with the
   fixtitle routine and returns whatever fixup() produces. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
5221
PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implementation of S.capitalize(): hands self to fixup() together with
   the fixcapitalize routine and returns whatever fixup() produces. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
5233
5234#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Implementation of S.capwords() (currently compiled out by the
   enclosing #if 0): split on whitespace, capitalize every word, and
   join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the shared exit below returns NULL */
        if (item == NULL)
            goto onError;
        /* drop the old word before storing the capitalized one in its
           slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

onError:
    /* shared exit for success and failure: item holds the result or NULL */
    Py_DECREF(list);
    return (PyObject *)item;
}
5270#endif
5271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005272/* Argument converter. Coerces to a single unicode character */
5273
5274static int
5275convert_uc(PyObject *obj, void *addr)
5276{
5277 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5278 PyObject *uniobj;
5279 Py_UNICODE *unistr;
5280
5281 uniobj = PyUnicode_FromObject(obj);
5282 if (uniobj == NULL) {
5283 PyErr_SetString(PyExc_TypeError,
5284 "The fill character cannot be converted to Unicode");
5285 return 0;
5286 }
5287 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5288 PyErr_SetString(PyExc_TypeError,
5289 "The fill character must be exactly one character long");
5290 Py_DECREF(uniobj);
5291 return 0;
5292 }
5293 unistr = PyUnicode_AS_UNICODE(uniobj);
5294 *fillcharloc = unistr[0];
5295 Py_DECREF(uniobj);
5296 return 1;
5297}
5298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005299PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005300"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005302Return S centered in a Unicode string of length width. Padding is\n\
5303done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305static PyObject *
5306unicode_center(PyUnicodeObject *self, PyObject *args)
5307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t marg, left;
5309 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005310 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Thomas Woutersde017742006-02-16 19:34:37 +00005312 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 return NULL;
5314
Tim Peters7a29bd52001-09-12 03:03:31 +00005315 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 Py_INCREF(self);
5317 return (PyObject*) self;
5318 }
5319
5320 marg = width - self->length;
5321 left = marg / 2 + (marg & width & 1);
5322
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005323 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324}
5325
Marc-André Lemburge5034372000-08-08 08:04:29 +00005326#if 0
5327
5328/* This code should go into some future Unicode collation support
5329 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005330 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005331
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005332/* speedy UTF-16 code point order comparison */
5333/* gleaned from: */
5334/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5335
/* Adjustment table indexed by (code unit >> 11).  Only the last five
   entries are non-zero: index 27 adds 0x2000 and indices 28..31
   subtract 0x800.  The adjustment is applied by the comparison routine
   below to code units above (1<<11) * 26. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Three-way comparison of two Unicode objects (this variant is compiled
   out by the enclosing #if 0).  Each pair of code units is remapped
   through utf16Fixup before being compared.  Returns -1, 0 or 1; when
   one string is a prefix of the other, the shorter one sorts first. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* NOTE(review): the table has 32 entries, so this lookup assumes
           c >> 11 is at most 31, i.e. 16-bit code units -- confirm
           before re-enabling with a wider Py_UNICODE */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: order by remaining length */
    return (len1 < len2) ? -1 : (len1 != len2);
}
5375
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376#else
5377
5378static int
5379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382
5383 Py_UNICODE *s1 = str1->str;
5384 Py_UNICODE *s2 = str2->str;
5385
5386 len1 = str1->length;
5387 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Marc-André Lemburge5034372000-08-08 08:04:29 +00005389 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005390 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005391
Fredrik Lundh45714e92001-06-26 16:39:36 +00005392 c1 = *s1++;
5393 c2 = *s2++;
5394
5395 if (c1 != c2)
5396 return (c1 < c2) ? -1 : 1;
5397
Marc-André Lemburge5034372000-08-08 08:04:29 +00005398 len1--; len2--;
5399 }
5400
5401 return (len1 < len2) ? -1 : (len1 != len2);
5402}
5403
5404#endif
5405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406int PyUnicode_Compare(PyObject *left,
5407 PyObject *right)
5408{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005409 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5410 return unicode_compare((PyUnicodeObject *)left,
5411 (PyUnicodeObject *)right);
5412 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5413 (PyUnicode_Check(left) && PyString_Check(right))) {
5414 if (PyUnicode_Check(left))
5415 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5416 if (PyUnicode_Check(right))
5417 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5418 assert(PyString_Check(left));
5419 assert(PyString_Check(right));
5420 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005422 PyErr_Format(PyExc_TypeError,
5423 "Can't compare %.100s and %.100s",
5424 left->ob_type->tp_name,
5425 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 return -1;
5427}
5428
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005429PyObject *PyUnicode_RichCompare(PyObject *left,
5430 PyObject *right,
5431 int op)
5432{
5433 int result;
5434
5435 result = PyUnicode_Compare(left, right);
5436 if (result == -1 && PyErr_Occurred())
5437 goto onError;
5438
5439 /* Convert the return value to a Boolean */
5440 switch (op) {
5441 case Py_EQ:
5442 result = (result == 0);
5443 break;
5444 case Py_NE:
5445 result = (result != 0);
5446 break;
5447 case Py_LE:
5448 result = (result <= 0);
5449 break;
5450 case Py_GE:
5451 result = (result >= 0);
5452 break;
5453 case Py_LT:
5454 result = (result == -1);
5455 break;
5456 case Py_GT:
5457 result = (result == 1);
5458 break;
5459 }
5460 return PyBool_FromLong(result);
5461
5462 onError:
5463
5464 /* Standard case
5465
5466 Type errors mean that PyUnicode_FromObject() could not convert
5467 one of the arguments (usually the right hand side) to Unicode,
5468 ie. we can't handle the comparison request. However, it is
5469 possible that the other object knows a comparison method, which
5470 is why we return Py_NotImplemented to give the other object a
5471 chance.
5472
5473 */
5474 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5475 PyErr_Clear();
5476 Py_INCREF(Py_NotImplemented);
5477 return Py_NotImplemented;
5478 }
5479 if (op != Py_EQ && op != Py_NE)
5480 return NULL;
5481
5482 /* Equality comparison.
5483
5484 This is a special case: we silence any PyExc_UnicodeDecodeError
5485 and instead turn it into a PyErr_UnicodeWarning.
5486
5487 */
5488 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5489 return NULL;
5490 PyErr_Clear();
5491 if (PyErr_Warn(PyExc_UnicodeWarning,
5492 (op == Py_EQ) ?
5493 "Unicode equal comparison "
5494 "failed to convert both arguments to Unicode - "
5495 "interpreting them as being unequal" :
5496 "Unicode unequal comparison "
5497 "failed to convert both arguments to Unicode - "
5498 "interpreting them as being unequal"
5499 ) < 0)
5500 return NULL;
5501 result = (op == Py_NE);
5502 return PyBool_FromLong(result);
5503}
5504
Guido van Rossum403d68b2000-03-13 15:55:09 +00005505int PyUnicode_Contains(PyObject *container,
5506 PyObject *element)
5507{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005508 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005510
5511 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005512 sub = PyUnicode_FromObject(element);
5513 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005514 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005515 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005516 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005517 }
5518
Thomas Wouters477c8d52006-05-27 19:21:47 +00005519 str = PyUnicode_FromObject(container);
5520 if (!str) {
5521 Py_DECREF(sub);
5522 return -1;
5523 }
5524
5525 result = stringlib_contains_obj(str, sub);
5526
5527 Py_DECREF(str);
5528 Py_DECREF(sub);
5529
Guido van Rossum403d68b2000-03-13 15:55:09 +00005530 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005531}
5532
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533/* Concat to string or Unicode object giving a new Unicode object. */
5534
5535PyObject *PyUnicode_Concat(PyObject *left,
5536 PyObject *right)
5537{
5538 PyUnicodeObject *u = NULL, *v = NULL, *w;
5539
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005540 if (PyBytes_Check(left) || PyBytes_Check(right))
5541 return PyBytes_Concat(left, right);
5542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 /* Coerce the two arguments */
5544 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5545 if (u == NULL)
5546 goto onError;
5547 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5548 if (v == NULL)
5549 goto onError;
5550
5551 /* Shortcuts */
5552 if (v == unicode_empty) {
5553 Py_DECREF(v);
5554 return (PyObject *)u;
5555 }
5556 if (u == unicode_empty) {
5557 Py_DECREF(u);
5558 return (PyObject *)v;
5559 }
5560
5561 /* Concat the two Unicode strings */
5562 w = _PyUnicode_New(u->length + v->length);
5563 if (w == NULL)
5564 goto onError;
5565 Py_UNICODE_COPY(w->str, u->str, u->length);
5566 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5567
5568 Py_DECREF(u);
5569 Py_DECREF(v);
5570 return (PyObject *)w;
5571
5572onError:
5573 Py_XDECREF(u);
5574 Py_XDECREF(v);
5575 return NULL;
5576}
5577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579"S.count(sub[, start[, end]]) -> int\n\
5580\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005581Return the number of non-overlapping occurrences of substring sub in\n\
5582Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005583interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
5585static PyObject *
5586unicode_count(PyUnicodeObject *self, PyObject *args)
5587{
5588 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005590 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 PyObject *result;
5592
Guido van Rossumb8872e62000-05-09 14:14:27 +00005593 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5594 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 return NULL;
5596
5597 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005598 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 if (substring == NULL)
5600 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005601
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604 result = PyInt_FromSsize_t(
5605 stringlib_count(self->str + start, end - start,
5606 substring->str, substring->length)
5607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
5609 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return result;
5612}
5613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005614PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005615"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005617Encodes S using the codec registered for encoding. encoding defaults\n\
5618to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005619handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5621'xmlcharrefreplace' as well as any other name registered with\n\
5622codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
5624static PyObject *
5625unicode_encode(PyUnicodeObject *self, PyObject *args)
5626{
5627 char *encoding = NULL;
5628 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005629 PyObject *v;
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5632 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005633 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005634 if (v == NULL)
5635 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005636 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005637 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005638 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005639 "(type=%.400s)",
5640 v->ob_type->tp_name);
5641 Py_DECREF(v);
5642 return NULL;
5643 }
5644 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005645
5646 onError:
5647 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005648}
5649
5650PyDoc_STRVAR(decode__doc__,
5651"S.decode([encoding[,errors]]) -> string or unicode\n\
5652\n\
5653Decodes S using the codec registered for encoding. encoding defaults\n\
5654to the default encoding. errors may be given to set a different error\n\
5655handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5656a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5657as well as any other name registerd with codecs.register_error that is\n\
5658able to handle UnicodeDecodeErrors.");
5659
5660static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005661unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005662{
5663 char *encoding = NULL;
5664 char *errors = NULL;
5665 PyObject *v;
5666
5667 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5668 return NULL;
5669 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005670 if (v == NULL)
5671 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005672 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5673 PyErr_Format(PyExc_TypeError,
5674 "decoder did not return a string/unicode object "
5675 "(type=%.400s)",
5676 v->ob_type->tp_name);
5677 Py_DECREF(v);
5678 return NULL;
5679 }
5680 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005681
5682 onError:
5683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687"S.expandtabs([tabsize]) -> unicode\n\
5688\n\
5689Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005690If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691
5692static PyObject*
5693unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5694{
5695 Py_UNICODE *e;
5696 Py_UNICODE *p;
5697 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005698 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 PyUnicodeObject *u;
5700 int tabsize = 8;
5701
5702 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5703 return NULL;
5704
Thomas Wouters7e474022000-07-16 12:04:32 +00005705 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 i = j = 0;
5707 e = self->str + self->length;
5708 for (p = self->str; p < e; p++)
5709 if (*p == '\t') {
5710 if (tabsize > 0)
5711 j += tabsize - (j % tabsize);
5712 }
5713 else {
5714 j++;
5715 if (*p == '\n' || *p == '\r') {
5716 i += j;
5717 j = 0;
5718 }
5719 }
5720
5721 /* Second pass: create output string and fill it */
5722 u = _PyUnicode_New(i + j);
5723 if (!u)
5724 return NULL;
5725
5726 j = 0;
5727 q = u->str;
5728
5729 for (p = self->str; p < e; p++)
5730 if (*p == '\t') {
5731 if (tabsize > 0) {
5732 i = tabsize - (j % tabsize);
5733 j += i;
5734 while (i--)
5735 *q++ = ' ';
5736 }
5737 }
5738 else {
5739 j++;
5740 *q++ = *p;
5741 if (*p == '\n' || *p == '\r')
5742 j = 0;
5743 }
5744
5745 return (PyObject*) u;
5746}
5747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005748PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749"S.find(sub [,start [,end]]) -> int\n\
5750\n\
5751Return the lowest index in S where substring sub is found,\n\
5752such that sub is contained within s[start,end]. Optional\n\
5753arguments start and end are interpreted as in slice notation.\n\
5754\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005755Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
5757static PyObject *
5758unicode_find(PyUnicodeObject *self, PyObject *args)
5759{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005760 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005762 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005763 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Guido van Rossumb8872e62000-05-09 14:14:27 +00005765 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5766 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005768 substring = PyUnicode_FromObject(substring);
5769 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 return NULL;
5771
Thomas Wouters477c8d52006-05-27 19:21:47 +00005772 result = stringlib_find_slice(
5773 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5774 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5775 start, end
5776 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
5778 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005779
5780 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781}
5782
5783static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785{
5786 if (index < 0 || index >= self->length) {
5787 PyErr_SetString(PyExc_IndexError, "string index out of range");
5788 return NULL;
5789 }
5790
5791 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5792}
5793
5794static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005795unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005797 /* Since Unicode objects compare equal to their UTF-8 string
5798 counterparts, we hash the UTF-8 string. */
5799 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5800 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801}
5802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005803PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804"S.index(sub [,start [,end]]) -> int\n\
5805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005806Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808static PyObject *
5809unicode_index(PyUnicodeObject *self, PyObject *args)
5810{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005812 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005813 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005814 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
Guido van Rossumb8872e62000-05-09 14:14:27 +00005816 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 substring = PyUnicode_FromObject(substring);
5820 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 return NULL;
5822
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 result = stringlib_find_slice(
5824 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5825 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5826 start, end
5827 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 if (result < 0) {
5832 PyErr_SetString(PyExc_ValueError, "substring not found");
5833 return NULL;
5834 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005835
Martin v. Löwis18e16552006-02-15 17:27:45 +00005836 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837}
5838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005839PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005840"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005842Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005846unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
5848 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5849 register const Py_UNICODE *e;
5850 int cased;
5851
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 /* Shortcut for single character strings */
5853 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005854 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005856 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005857 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005858 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005859
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 e = p + PyUnicode_GET_SIZE(self);
5861 cased = 0;
5862 for (; p < e; p++) {
5863 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005864
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005866 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 else if (!cased && Py_UNICODE_ISLOWER(ch))
5868 cased = 1;
5869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005870 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005873PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005874"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005876Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005877at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
5879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005880unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
5882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5883 register const Py_UNICODE *e;
5884 int cased;
5885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 /* Shortcut for single character strings */
5887 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005888 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005891 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 e = p + PyUnicode_GET_SIZE(self);
5895 cased = 0;
5896 for (; p < e; p++) {
5897 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005898
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 else if (!cased && Py_UNICODE_ISUPPER(ch))
5902 cased = 1;
5903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005904 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905}
5906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005907PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005908"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005910Return True if S is a titlecased string and there is at least one\n\
5911character in S, i.e. upper- and titlecase characters may only\n\
5912follow uncased characters and lowercase characters only cased ones.\n\
5913Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
5915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005916unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917{
5918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5919 register const Py_UNICODE *e;
5920 int cased, previous_is_cased;
5921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 /* Shortcut for single character strings */
5923 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005924 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5925 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005927 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005928 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 e = p + PyUnicode_GET_SIZE(self);
5932 cased = 0;
5933 previous_is_cased = 0;
5934 for (; p < e; p++) {
5935 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5938 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005939 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 previous_is_cased = 1;
5941 cased = 1;
5942 }
5943 else if (Py_UNICODE_ISLOWER(ch)) {
5944 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 previous_is_cased = 1;
5947 cased = 1;
5948 }
5949 else
5950 previous_is_cased = 0;
5951 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005952 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953}
5954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005955PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005956"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005958Return True if all characters in S are whitespace\n\
5959and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
5961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005962unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963{
5964 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5965 register const Py_UNICODE *e;
5966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 /* Shortcut for single character strings */
5968 if (PyUnicode_GET_SIZE(self) == 1 &&
5969 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005972 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005973 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005974 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 e = p + PyUnicode_GET_SIZE(self);
5977 for (; p < e; p++) {
5978 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005979 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005981 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982}
5983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005984PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005985"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005986\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005987Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005988and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005989
5990static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005991unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005992{
5993 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5994 register const Py_UNICODE *e;
5995
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005996 /* Shortcut for single character strings */
5997 if (PyUnicode_GET_SIZE(self) == 1 &&
5998 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005999 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006000
6001 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006002 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006003 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006004
6005 e = p + PyUnicode_GET_SIZE(self);
6006 for (; p < e; p++) {
6007 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006009 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006010 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006011}
6012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006013PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006015\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006016Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006017and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006018
6019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006020unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006021{
6022 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6023 register const Py_UNICODE *e;
6024
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006025 /* Shortcut for single character strings */
6026 if (PyUnicode_GET_SIZE(self) == 1 &&
6027 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006028 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006029
6030 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006031 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006033
6034 e = p + PyUnicode_GET_SIZE(self);
6035 for (; p < e; p++) {
6036 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006038 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006039 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006040}
6041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006042PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006043"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006045Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006046False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047
6048static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006049unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050{
6051 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6052 register const Py_UNICODE *e;
6053
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 /* Shortcut for single character strings */
6055 if (PyUnicode_GET_SIZE(self) == 1 &&
6056 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006057 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006059 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006060 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 e = p + PyUnicode_GET_SIZE(self);
6064 for (; p < e; p++) {
6065 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006066 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006068 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069}
6070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006072"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006074Return True if all characters in S are digits\n\
6075and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006078unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
6080 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6081 register const Py_UNICODE *e;
6082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 /* Shortcut for single character strings */
6084 if (PyUnicode_GET_SIZE(self) == 1 &&
6085 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006086 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006088 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006089 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 e = p + PyUnicode_GET_SIZE(self);
6093 for (; p < e; p++) {
6094 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006095 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006097 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006100PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006101"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006103Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006104False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105
6106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006107unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108{
6109 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6110 register const Py_UNICODE *e;
6111
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 /* Shortcut for single character strings */
6113 if (PyUnicode_GET_SIZE(self) == 1 &&
6114 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006115 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006117 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006118 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 e = p + PyUnicode_GET_SIZE(self);
6122 for (; p < e; p++) {
6123 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006124 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006126 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127}
6128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006129PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130"S.join(sequence) -> unicode\n\
6131\n\
6132Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006133sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
6135static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006136unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006138 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139}
6140
Martin v. Löwis18e16552006-02-15 17:27:45 +00006141static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142unicode_length(PyUnicodeObject *self)
6143{
6144 return self->length;
6145}
6146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006147PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006148"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149\n\
6150Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006151done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject *
6154unicode_ljust(PyUnicodeObject *self, PyObject *args)
6155{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006156 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006157 Py_UNICODE fillchar = ' ';
6158
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006159 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return NULL;
6161
Tim Peters7a29bd52001-09-12 03:03:31 +00006162 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 Py_INCREF(self);
6164 return (PyObject*) self;
6165 }
6166
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006167 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168}
6169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171"S.lower() -> unicode\n\
6172\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006173Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
6175static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006176unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return fixup(self, fixlower);
6179}
6180
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: skips the three-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6189
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006190/* externally visible for str.strip(unicode) */
6191PyObject *
6192_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6193{
6194 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006196 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6198 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006199
Thomas Wouters477c8d52006-05-27 19:21:47 +00006200 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6201
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006202 i = 0;
6203 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006204 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6205 i++;
6206 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006207 }
6208
6209 j = len;
6210 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006211 do {
6212 j--;
6213 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6214 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006215 }
6216
6217 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006218 Py_INCREF(self);
6219 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006220 }
6221 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006222 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006223}
6224
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
6226static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006227do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006229 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006231
6232 i = 0;
6233 if (striptype != RIGHTSTRIP) {
6234 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6235 i++;
6236 }
6237 }
6238
6239 j = len;
6240 if (striptype != LEFTSTRIP) {
6241 do {
6242 j--;
6243 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6244 j++;
6245 }
6246
6247 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6248 Py_INCREF(self);
6249 return (PyObject*)self;
6250 }
6251 else
6252 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253}
6254
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006255
6256static PyObject *
6257do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6258{
6259 PyObject *sep = NULL;
6260
6261 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6262 return NULL;
6263
6264 if (sep != NULL && sep != Py_None) {
6265 if (PyUnicode_Check(sep))
6266 return _PyUnicode_XStrip(self, striptype, sep);
6267 else if (PyString_Check(sep)) {
6268 PyObject *res;
6269 sep = PyUnicode_FromObject(sep);
6270 if (sep==NULL)
6271 return NULL;
6272 res = _PyUnicode_XStrip(self, striptype, sep);
6273 Py_DECREF(sep);
6274 return res;
6275 }
6276 else {
6277 PyErr_Format(PyExc_TypeError,
6278 "%s arg must be None, unicode or str",
6279 STRIPNAME(striptype));
6280 return NULL;
6281 }
6282 }
6283
6284 return do_strip(self, striptype);
6285}
6286
6287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006289"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006290\n\
6291Return a copy of the string S with leading and trailing\n\
6292whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006293If chars is given and not None, remove characters in chars instead.\n\
6294If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006295
6296static PyObject *
6297unicode_strip(PyUnicodeObject *self, PyObject *args)
6298{
6299 if (PyTuple_GET_SIZE(args) == 0)
6300 return do_strip(self, BOTHSTRIP); /* Common case */
6301 else
6302 return do_argstrip(self, BOTHSTRIP, args);
6303}
6304
6305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006306PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006307"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308\n\
6309Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006310If chars is given and not None, remove characters in chars instead.\n\
6311If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006312
6313static PyObject *
6314unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6315{
6316 if (PyTuple_GET_SIZE(args) == 0)
6317 return do_strip(self, LEFTSTRIP); /* Common case */
6318 else
6319 return do_argstrip(self, LEFTSTRIP, args);
6320}
6321
6322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006323PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006324"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006325\n\
6326Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006327If chars is given and not None, remove characters in chars instead.\n\
6328If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006329
6330static PyObject *
6331unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6332{
6333 if (PyTuple_GET_SIZE(args) == 0)
6334 return do_strip(self, RIGHTSTRIP); /* Common case */
6335 else
6336 return do_argstrip(self, RIGHTSTRIP, args);
6337}
6338
6339
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006341unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
6343 PyUnicodeObject *u;
6344 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006345 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006346 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
6348 if (len < 0)
6349 len = 0;
6350
Tim Peters7a29bd52001-09-12 03:03:31 +00006351 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 /* no repeat, return original string */
6353 Py_INCREF(str);
6354 return (PyObject*) str;
6355 }
Tim Peters8f422462000-09-09 06:13:41 +00006356
6357 /* ensure # of chars needed doesn't overflow int and # of bytes
6358 * needed doesn't overflow size_t
6359 */
6360 nchars = len * str->length;
6361 if (len && nchars / len != str->length) {
6362 PyErr_SetString(PyExc_OverflowError,
6363 "repeated string is too long");
6364 return NULL;
6365 }
6366 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6367 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6368 PyErr_SetString(PyExc_OverflowError,
6369 "repeated string is too long");
6370 return NULL;
6371 }
6372 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 if (!u)
6374 return NULL;
6375
6376 p = u->str;
6377
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 if (str->length == 1 && len > 0) {
6379 Py_UNICODE_FILL(p, str->str[0], len);
6380 } else {
6381 Py_ssize_t done = 0; /* number of characters copied this far */
6382 if (done < nchars) {
6383 Py_UNICODE_COPY(p, str->str, str->length);
6384 done = str->length;
6385 }
6386 while (done < nchars) {
6387 int n = (done <= nchars-done) ? done : nchars-done;
6388 Py_UNICODE_COPY(p+done, p, n);
6389 done += n;
6390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
6392
6393 return (PyObject*) u;
6394}
6395
6396PyObject *PyUnicode_Replace(PyObject *obj,
6397 PyObject *subobj,
6398 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006399 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
6401 PyObject *self;
6402 PyObject *str1;
6403 PyObject *str2;
6404 PyObject *result;
6405
6406 self = PyUnicode_FromObject(obj);
6407 if (self == NULL)
6408 return NULL;
6409 str1 = PyUnicode_FromObject(subobj);
6410 if (str1 == NULL) {
6411 Py_DECREF(self);
6412 return NULL;
6413 }
6414 str2 = PyUnicode_FromObject(replobj);
6415 if (str2 == NULL) {
6416 Py_DECREF(self);
6417 Py_DECREF(str1);
6418 return NULL;
6419 }
Tim Petersced69f82003-09-16 20:30:58 +00006420 result = replace((PyUnicodeObject *)self,
6421 (PyUnicodeObject *)str1,
6422 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 maxcount);
6424 Py_DECREF(self);
6425 Py_DECREF(str1);
6426 Py_DECREF(str2);
6427 return result;
6428}
6429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431"S.replace (old, new[, maxsplit]) -> unicode\n\
6432\n\
6433Return a copy of S with all occurrences of substring\n\
6434old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006435given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
6437static PyObject*
6438unicode_replace(PyUnicodeObject *self, PyObject *args)
6439{
6440 PyUnicodeObject *str1;
6441 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006442 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 PyObject *result;
6444
Martin v. Löwis18e16552006-02-15 17:27:45 +00006445 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return NULL;
6447 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6448 if (str1 == NULL)
6449 return NULL;
6450 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006451 if (str2 == NULL) {
6452 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
6456 result = replace(self, str1, str2, maxcount);
6457
6458 Py_DECREF(str1);
6459 Py_DECREF(str2);
6460 return result;
6461}
6462
6463static
6464PyObject *unicode_repr(PyObject *unicode)
6465{
6466 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6467 PyUnicode_GET_SIZE(unicode),
6468 1);
6469}
6470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006471PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472"S.rfind(sub [,start [,end]]) -> int\n\
6473\n\
6474Return the highest index in S where substring sub is found,\n\
6475such that sub is contained within s[start,end]. Optional\n\
6476arguments start and end are interpreted as in slice notation.\n\
6477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
6480static PyObject *
6481unicode_rfind(PyUnicodeObject *self, PyObject *args)
6482{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006483 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006484 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006485 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
Guido van Rossumb8872e62000-05-09 14:14:27 +00006488 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6489 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 substring = PyUnicode_FromObject(substring);
6492 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 return NULL;
6494
Thomas Wouters477c8d52006-05-27 19:21:47 +00006495 result = stringlib_rfind_slice(
6496 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6497 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6498 start, end
6499 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
6501 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006502
6503 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507"S.rindex(sub [,start [,end]]) -> int\n\
6508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511static PyObject *
6512unicode_rindex(PyUnicodeObject *self, PyObject *args)
6513{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006515 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006516 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006517 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518
Guido van Rossumb8872e62000-05-09 14:14:27 +00006519 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6520 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522 substring = PyUnicode_FromObject(substring);
6523 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 return NULL;
6525
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 result = stringlib_rfind_slice(
6527 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6528 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6529 start, end
6530 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
6532 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 if (result < 0) {
6535 PyErr_SetString(PyExc_ValueError, "substring not found");
6536 return NULL;
6537 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006538 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006541PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006542"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543\n\
6544Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006545done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547static PyObject *
6548unicode_rjust(PyUnicodeObject *self, PyObject *args)
6549{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006550 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006551 Py_UNICODE fillchar = ' ';
6552
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006553 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return NULL;
6555
Tim Peters7a29bd52001-09-12 03:03:31 +00006556 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 Py_INCREF(self);
6558 return (PyObject*) self;
6559 }
6560
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006561 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006565unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566{
6567 /* standard clamping */
6568 if (start < 0)
6569 start = 0;
6570 if (end < 0)
6571 end = 0;
6572 if (end > self->length)
6573 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006574 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 /* full slice, return original string */
6576 Py_INCREF(self);
6577 return (PyObject*) self;
6578 }
6579 if (start > end)
6580 start = end;
6581 /* copy slice */
6582 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6583 end - start);
6584}
6585
6586PyObject *PyUnicode_Split(PyObject *s,
6587 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006588 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
6590 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006591
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 s = PyUnicode_FromObject(s);
6593 if (s == NULL)
6594 return NULL;
6595 if (sep != NULL) {
6596 sep = PyUnicode_FromObject(sep);
6597 if (sep == NULL) {
6598 Py_DECREF(s);
6599 return NULL;
6600 }
6601 }
6602
6603 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6604
6605 Py_DECREF(s);
6606 Py_XDECREF(sep);
6607 return result;
6608}
6609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611"S.split([sep [,maxsplit]]) -> list of strings\n\
6612\n\
6613Return a list of the words in S, using sep as the\n\
6614delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006615splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006616any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
6618static PyObject*
6619unicode_split(PyUnicodeObject *self, PyObject *args)
6620{
6621 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Martin v. Löwis18e16552006-02-15 17:27:45 +00006624 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 return NULL;
6626
6627 if (substring == Py_None)
6628 return split(self, NULL, maxcount);
6629 else if (PyUnicode_Check(substring))
6630 return split(self, (PyUnicodeObject *)substring, maxcount);
6631 else
6632 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6633}
6634
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635PyObject *
6636PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6637{
6638 PyObject* str_obj;
6639 PyObject* sep_obj;
6640 PyObject* out;
6641
6642 str_obj = PyUnicode_FromObject(str_in);
6643 if (!str_obj)
6644 return NULL;
6645 sep_obj = PyUnicode_FromObject(sep_in);
6646 if (!sep_obj) {
6647 Py_DECREF(str_obj);
6648 return NULL;
6649 }
6650
6651 out = stringlib_partition(
6652 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6653 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6654 );
6655
6656 Py_DECREF(sep_obj);
6657 Py_DECREF(str_obj);
6658
6659 return out;
6660}
6661
6662
6663PyObject *
6664PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6665{
6666 PyObject* str_obj;
6667 PyObject* sep_obj;
6668 PyObject* out;
6669
6670 str_obj = PyUnicode_FromObject(str_in);
6671 if (!str_obj)
6672 return NULL;
6673 sep_obj = PyUnicode_FromObject(sep_in);
6674 if (!sep_obj) {
6675 Py_DECREF(str_obj);
6676 return NULL;
6677 }
6678
6679 out = stringlib_rpartition(
6680 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6681 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6682 );
6683
6684 Py_DECREF(sep_obj);
6685 Py_DECREF(str_obj);
6686
6687 return out;
6688}
6689
6690PyDoc_STRVAR(partition__doc__,
6691"S.partition(sep) -> (head, sep, tail)\n\
6692\n\
6693Searches for the separator sep in S, and returns the part before it,\n\
6694the separator itself, and the part after it. If the separator is not\n\
6695found, returns S and two empty strings.");
6696
6697static PyObject*
6698unicode_partition(PyUnicodeObject *self, PyObject *separator)
6699{
6700 return PyUnicode_Partition((PyObject *)self, separator);
6701}
6702
6703PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006704"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705\n\
6706Searches for the separator sep in S, starting at the end of S, and returns\n\
6707the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006708separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006709
6710static PyObject*
6711unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6712{
6713 return PyUnicode_RPartition((PyObject *)self, separator);
6714}
6715
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006716PyObject *PyUnicode_RSplit(PyObject *s,
6717 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006718 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006719{
6720 PyObject *result;
6721
6722 s = PyUnicode_FromObject(s);
6723 if (s == NULL)
6724 return NULL;
6725 if (sep != NULL) {
6726 sep = PyUnicode_FromObject(sep);
6727 if (sep == NULL) {
6728 Py_DECREF(s);
6729 return NULL;
6730 }
6731 }
6732
6733 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6734
6735 Py_DECREF(s);
6736 Py_XDECREF(sep);
6737 return result;
6738}
6739
6740PyDoc_STRVAR(rsplit__doc__,
6741"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6742\n\
6743Return a list of the words in S, using sep as the\n\
6744delimiter string, starting at the end of the string and\n\
6745working to the front. If maxsplit is given, at most maxsplit\n\
6746splits are done. If sep is not specified, any whitespace string\n\
6747is a separator.");
6748
6749static PyObject*
6750unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6751{
6752 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006754
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006756 return NULL;
6757
6758 if (substring == Py_None)
6759 return rsplit(self, NULL, maxcount);
6760 else if (PyUnicode_Check(substring))
6761 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6762 else
6763 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6764}
6765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006766PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006767"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768\n\
6769Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006770Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
6773static PyObject*
6774unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6775{
Guido van Rossum86662912000-04-11 15:38:46 +00006776 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
Guido van Rossum86662912000-04-11 15:38:46 +00006778 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 return NULL;
6780
Guido van Rossum86662912000-04-11 15:38:46 +00006781 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
6784static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006785PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006787 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6788 Py_XINCREF(res);
6789 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790}
6791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006792PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793"S.swapcase() -> unicode\n\
6794\n\
6795Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006796and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
6798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006799unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 return fixup(self, fixswapcase);
6802}
6803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805"S.translate(table) -> unicode\n\
6806\n\
6807Return a copy of the string S, where all characters have been mapped\n\
6808through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006809Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6810Unmapped characters are left untouched. Characters mapped to None\n\
6811are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
6813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006814unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
Tim Petersced69f82003-09-16 20:30:58 +00006816 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006818 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 "ignore");
6820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823"S.upper() -> unicode\n\
6824\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
6827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006828unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 return fixup(self, fixupper);
6831}
6832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834"S.zfill(width) -> unicode\n\
6835\n\
6836Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
6839static PyObject *
6840unicode_zfill(PyUnicodeObject *self, PyObject *args)
6841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006842 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 PyUnicodeObject *u;
6844
Martin v. Löwis18e16552006-02-15 17:27:45 +00006845 Py_ssize_t width;
6846 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 return NULL;
6848
6849 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006850 if (PyUnicode_CheckExact(self)) {
6851 Py_INCREF(self);
6852 return (PyObject*) self;
6853 }
6854 else
6855 return PyUnicode_FromUnicode(
6856 PyUnicode_AS_UNICODE(self),
6857 PyUnicode_GET_SIZE(self)
6858 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 }
6860
6861 fill = width - self->length;
6862
6863 u = pad(self, fill, 0, '0');
6864
Walter Dörwald068325e2002-04-15 13:36:47 +00006865 if (u == NULL)
6866 return NULL;
6867
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 if (u->str[fill] == '+' || u->str[fill] == '-') {
6869 /* move sign to beginning of string */
6870 u->str[0] = u->str[fill];
6871 u->str[fill] = '0';
6872 }
6873
6874 return (PyObject*) u;
6875}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as an
   int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006888Return True if S starts with the specified prefix, False otherwise.\n\
6889With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890With optional end, stop comparing S at that position.\n\
6891prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
6893static PyObject *
6894unicode_startswith(PyUnicodeObject *self,
6895 PyObject *args)
6896{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006900 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006904 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906 if (PyTuple_Check(subobj)) {
6907 Py_ssize_t i;
6908 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6909 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6910 PyTuple_GET_ITEM(subobj, i));
6911 if (substring == NULL)
6912 return NULL;
6913 result = tailmatch(self, substring, start, end, -1);
6914 Py_DECREF(substring);
6915 if (result) {
6916 Py_RETURN_TRUE;
6917 }
6918 }
6919 /* nothing matched */
6920 Py_RETURN_FALSE;
6921 }
6922 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924 return NULL;
6925 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
6930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006934Return True if S ends with the specified suffix, False otherwise.\n\
6935With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006936With optional end, stop comparing S at that position.\n\
6937suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject *
6940unicode_endswith(PyUnicodeObject *self,
6941 PyObject *args)
6942{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006945 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006946 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6950 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 if (PyTuple_Check(subobj)) {
6953 Py_ssize_t i;
6954 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6955 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6956 PyTuple_GET_ITEM(subobj, i));
6957 if (substring == NULL)
6958 return NULL;
6959 result = tailmatch(self, substring, start, end, +1);
6960 Py_DECREF(substring);
6961 if (result) {
6962 Py_RETURN_TRUE;
6963 }
6964 }
6965 Py_RETURN_FALSE;
6966 }
6967 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974}
6975
6976
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006977
6978static PyObject *
6979unicode_getnewargs(PyUnicodeObject *v)
6980{
6981 return Py_BuildValue("(u#)", v->str, v->length);
6982}
6983
6984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985static PyMethodDef unicode_methods[] = {
6986
6987 /* Order is according to common usage: often used methods should
6988 appear first, since lookup is done sequentially. */
6989
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006990 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6991 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6992 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006993 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006994 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6995 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6996 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6997 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6998 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6999 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7000 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007001 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007002 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7003 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7004 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007005 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007006 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007007/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7008 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7009 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7010 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007011 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007013 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007014 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7016 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7017 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7018 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7019 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7020 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7021 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7022 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7023 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7024 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7025 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7026 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7027 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7028 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007029 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007030#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007031 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032#endif
7033
7034#if 0
7035 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007036 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037#endif
7038
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007039 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 {NULL, NULL}
7041};
7042
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007043static PyObject *
7044unicode_mod(PyObject *v, PyObject *w)
7045{
7046 if (!PyUnicode_Check(v)) {
7047 Py_INCREF(Py_NotImplemented);
7048 return Py_NotImplemented;
7049 }
7050 return PyUnicode_Format(v, w);
7051}
7052
7053static PyNumberMethods unicode_as_number = {
7054 0, /*nb_add*/
7055 0, /*nb_subtract*/
7056 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007057 unicode_mod, /*nb_remainder*/
7058};
7059
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007061 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007062 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007063 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7064 (ssizeargfunc) unicode_getitem, /* sq_item */
7065 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 0, /* sq_ass_item */
7067 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007068 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069};
7070
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007071static PyObject*
7072unicode_subscript(PyUnicodeObject* self, PyObject* item)
7073{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007074 if (PyIndex_Check(item)) {
7075 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007076 if (i == -1 && PyErr_Occurred())
7077 return NULL;
7078 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007079 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007080 return unicode_getitem(self, i);
7081 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007082 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007083 Py_UNICODE* source_buf;
7084 Py_UNICODE* result_buf;
7085 PyObject* result;
7086
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007087 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007088 &start, &stop, &step, &slicelength) < 0) {
7089 return NULL;
7090 }
7091
7092 if (slicelength <= 0) {
7093 return PyUnicode_FromUnicode(NULL, 0);
7094 } else {
7095 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007096 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7097 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007098
7099 if (result_buf == NULL)
7100 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007101
7102 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7103 result_buf[i] = source_buf[cur];
7104 }
Tim Petersced69f82003-09-16 20:30:58 +00007105
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007106 result = PyUnicode_FromUnicode(result_buf, slicelength);
7107 PyMem_FREE(result_buf);
7108 return result;
7109 }
7110 } else {
7111 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7112 return NULL;
7113 }
7114}
7115
7116static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007118 (binaryfunc)unicode_subscript, /* mp_subscript */
7119 (objobjargproc)0, /* mp_ass_subscript */
7120};
7121
Martin v. Löwis18e16552006-02-15 17:27:45 +00007122static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007124 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 const void **ptr)
7126{
7127 if (index != 0) {
7128 PyErr_SetString(PyExc_SystemError,
7129 "accessing non-existent unicode segment");
7130 return -1;
7131 }
7132 *ptr = (void *) self->str;
7133 return PyUnicode_GET_DATA_SIZE(self);
7134}
7135
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136static Py_ssize_t
7137unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 const void **ptr)
7139{
7140 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007141 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return -1;
7143}
7144
7145static int
7146unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007147 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148{
7149 if (lenp)
7150 *lenp = PyUnicode_GET_DATA_SIZE(self);
7151 return 1;
7152}
7153
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007154static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007156 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 const void **ptr)
7158{
7159 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007160
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 if (index != 0) {
7162 PyErr_SetString(PyExc_SystemError,
7163 "accessing non-existent unicode segment");
7164 return -1;
7165 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007166 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 if (str == NULL)
7168 return -1;
7169 *ptr = (void *) PyString_AS_STRING(str);
7170 return PyString_GET_SIZE(str);
7171}
7172
7173/* Helpers for PyUnicode_Format() */
7174
7175static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007176getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007178 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 if (argidx < arglen) {
7180 (*p_argidx)++;
7181 if (arglen < 0)
7182 return args;
7183 else
7184 return PyTuple_GetItem(args, argidx);
7185 }
7186 PyErr_SetString(PyExc_TypeError,
7187 "not enough arguments for format string");
7188 return NULL;
7189}
7190
/* Bit flags recorded while parsing the flag characters of a '%'
   conversion specifier; each comment names the character that sets it. */
#define F_LJUST 0x01    /* '-' (also set for a negative '*' width) */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7196
Martin v. Löwis18e16552006-02-15 17:27:45 +00007197static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007198strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200 register Py_ssize_t i;
7201 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 for (i = len - 1; i >= 0; i--)
7203 buffer[i] = (Py_UNICODE) charbuffer[i];
7204
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return len;
7206}
7207
Neal Norwitzfc76d632006-01-10 06:03:13 +00007208static int
7209doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7210{
Tim Peters15231542006-02-16 01:08:01 +00007211 Py_ssize_t result;
7212
Neal Norwitzfc76d632006-01-10 06:03:13 +00007213 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007214 result = strtounicode(buffer, (char *)buffer);
7215 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007216}
7217
7218static int
7219longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7220{
Tim Peters15231542006-02-16 01:08:01 +00007221 Py_ssize_t result;
7222
Neal Norwitzfc76d632006-01-10 06:03:13 +00007223 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007224 result = strtounicode(buffer, (char *)buffer);
7225 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007226}
7227
Guido van Rossum078151d2002-08-11 04:24:12 +00007228/* XXX To save some code duplication, formatfloat/long/int could have been
7229 shared with stringobject.c, converting from 8-bit to Unicode after the
7230 formatting is done. */
7231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232static int
7233formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007234 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 int flags,
7236 int prec,
7237 int type,
7238 PyObject *v)
7239{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007240 /* fmt = '%#.' + `prec` + `type`
7241 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 char fmt[20];
7243 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 x = PyFloat_AsDouble(v);
7246 if (x == -1.0 && PyErr_Occurred())
7247 return -1;
7248 if (prec < 0)
7249 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7251 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007252 /* Worst case length calc to ensure no buffer overrun:
7253
7254 'g' formats:
7255 fmt = %#.<prec>g
7256 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7257 for any double rep.)
7258 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7259
7260 'f' formats:
7261 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7262 len = 1 + 50 + 1 + prec = 52 + prec
7263
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007264 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007265 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007266
7267 */
7268 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7269 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007270 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007271 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007272 return -1;
7273 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007274 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7275 (flags&F_ALT) ? "#" : "",
7276 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007277 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278}
7279
Tim Peters38fd5b62000-09-21 05:43:11 +00007280static PyObject*
7281formatlong(PyObject *val, int flags, int prec, int type)
7282{
7283 char *buf;
7284 int i, len;
7285 PyObject *str; /* temporary string object. */
7286 PyUnicodeObject *result;
7287
7288 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7289 if (!str)
7290 return NULL;
7291 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007292 if (!result) {
7293 Py_DECREF(str);
7294 return NULL;
7295 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007296 for (i = 0; i < len; i++)
7297 result->str[i] = buf[i];
7298 result->str[len] = 0;
7299 Py_DECREF(str);
7300 return (PyObject*)result;
7301}
7302
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303static int
7304formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007305 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 int flags,
7307 int prec,
7308 int type,
7309 PyObject *v)
7310{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007311 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007312 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7313 * + 1 + 1
7314 * = 24
7315 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007316 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007317 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 long x;
7319
7320 x = PyInt_AsLong(v);
7321 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007322 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007323 if (x < 0 && type == 'u') {
7324 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007325 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007326 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7327 sign = "-";
7328 else
7329 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007331 prec = 1;
7332
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007333 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7334 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007335 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007336 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007337 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007338 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007339 return -1;
7340 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007341
7342 if ((flags & F_ALT) &&
7343 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007344 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007345 * of issues that cause pain:
7346 * - when 0 is being converted, the C standard leaves off
7347 * the '0x' or '0X', which is inconsistent with other
7348 * %#x/%#X conversions and inconsistent with Python's
7349 * hex() function
7350 * - there are platforms that violate the standard and
7351 * convert 0 with the '0x' or '0X'
7352 * (Metrowerks, Compaq Tru64)
7353 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007354 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007355 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007356 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007357 * We can achieve the desired consistency by inserting our
7358 * own '0x' or '0X' prefix, and substituting %x/%X in place
7359 * of %#x/%#X.
7360 *
7361 * Note that this is the same approach as used in
7362 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007363 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007364 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7365 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007366 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007367 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007368 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7369 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007370 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007371 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007372 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007373 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007375 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376}
7377
7378static int
7379formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007380 size_t buflen,
7381 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007383 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007384 if (PyUnicode_Check(v)) {
7385 if (PyUnicode_GET_SIZE(v) != 1)
7386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007390 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007391 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007392 goto onError;
7393 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
7396 else {
7397 /* Integer input truncated to a character */
7398 long x;
7399 x = PyInt_AsLong(v);
7400 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007401 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007402#ifdef Py_UNICODE_WIDE
7403 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007404 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007405 "%c arg not in range(0x110000) "
7406 "(wide Python build)");
7407 return -1;
7408 }
7409#else
7410 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007411 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007412 "%c arg not in range(0x10000) "
7413 "(narrow Python build)");
7414 return -1;
7415 }
7416#endif
7417 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 }
7419 buf[1] = '\0';
7420 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007421
7422 onError:
7423 PyErr_SetString(PyExc_TypeError,
7424 "%c requires int or char");
7425 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426}
7427
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007428/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7429
7430 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7431 chars are formatted. XXX This is a magic number. Each formatting
7432 routine does bounds checking to ensure no overflow, but a better
7433 solution may be to malloc a buffer of appropriate size for each
7434 format. For now, the current solution is sufficient.
7435*/
/* Parenthesized so the cast stays one unit wherever the macro is
   expanded (e.g. as an operand of sizeof or next to a postfix
   operator). */
#define FORMATBUFLEN ((size_t)120)
7437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438PyObject *PyUnicode_Format(PyObject *format,
7439 PyObject *args)
7440{
7441 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007442 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 int args_owned = 0;
7444 PyUnicodeObject *result = NULL;
7445 PyObject *dict = NULL;
7446 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007447
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 if (format == NULL || args == NULL) {
7449 PyErr_BadInternalCall();
7450 return NULL;
7451 }
7452 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007453 if (uformat == NULL)
7454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 fmt = PyUnicode_AS_UNICODE(uformat);
7456 fmtcnt = PyUnicode_GET_SIZE(uformat);
7457
7458 reslen = rescnt = fmtcnt + 100;
7459 result = _PyUnicode_New(reslen);
7460 if (result == NULL)
7461 goto onError;
7462 res = PyUnicode_AS_UNICODE(result);
7463
7464 if (PyTuple_Check(args)) {
7465 arglen = PyTuple_Size(args);
7466 argidx = 0;
7467 }
7468 else {
7469 arglen = -1;
7470 argidx = -2;
7471 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007472 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7473 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 dict = args;
7475
7476 while (--fmtcnt >= 0) {
7477 if (*fmt != '%') {
7478 if (--rescnt < 0) {
7479 rescnt = fmtcnt + 100;
7480 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007481 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007482 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7484 --rescnt;
7485 }
7486 *res++ = *fmt++;
7487 }
7488 else {
7489 /* Got a format specifier */
7490 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007491 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 Py_UNICODE c = '\0';
7494 Py_UNICODE fill;
7495 PyObject *v = NULL;
7496 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007497 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007500 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501
7502 fmt++;
7503 if (*fmt == '(') {
7504 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 PyObject *key;
7507 int pcount = 1;
7508
7509 if (dict == NULL) {
7510 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007511 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 goto onError;
7513 }
7514 ++fmt;
7515 --fmtcnt;
7516 keystart = fmt;
7517 /* Skip over balanced parentheses */
7518 while (pcount > 0 && --fmtcnt >= 0) {
7519 if (*fmt == ')')
7520 --pcount;
7521 else if (*fmt == '(')
7522 ++pcount;
7523 fmt++;
7524 }
7525 keylen = fmt - keystart - 1;
7526 if (fmtcnt < 0 || pcount > 0) {
7527 PyErr_SetString(PyExc_ValueError,
7528 "incomplete format key");
7529 goto onError;
7530 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007531#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007532 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 then looked up since Python uses strings to hold
7534 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007535 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 key = PyUnicode_EncodeUTF8(keystart,
7537 keylen,
7538 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007539#else
7540 key = PyUnicode_FromUnicode(keystart, keylen);
7541#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 if (key == NULL)
7543 goto onError;
7544 if (args_owned) {
7545 Py_DECREF(args);
7546 args_owned = 0;
7547 }
7548 args = PyObject_GetItem(dict, key);
7549 Py_DECREF(key);
7550 if (args == NULL) {
7551 goto onError;
7552 }
7553 args_owned = 1;
7554 arglen = -1;
7555 argidx = -2;
7556 }
7557 while (--fmtcnt >= 0) {
7558 switch (c = *fmt++) {
7559 case '-': flags |= F_LJUST; continue;
7560 case '+': flags |= F_SIGN; continue;
7561 case ' ': flags |= F_BLANK; continue;
7562 case '#': flags |= F_ALT; continue;
7563 case '0': flags |= F_ZERO; continue;
7564 }
7565 break;
7566 }
7567 if (c == '*') {
7568 v = getnextarg(args, arglen, &argidx);
7569 if (v == NULL)
7570 goto onError;
7571 if (!PyInt_Check(v)) {
7572 PyErr_SetString(PyExc_TypeError,
7573 "* wants int");
7574 goto onError;
7575 }
7576 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007577 if (width == -1 && PyErr_Occurred())
7578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 if (width < 0) {
7580 flags |= F_LJUST;
7581 width = -width;
7582 }
7583 if (--fmtcnt >= 0)
7584 c = *fmt++;
7585 }
7586 else if (c >= '0' && c <= '9') {
7587 width = c - '0';
7588 while (--fmtcnt >= 0) {
7589 c = *fmt++;
7590 if (c < '0' || c > '9')
7591 break;
7592 if ((width*10) / 10 != width) {
7593 PyErr_SetString(PyExc_ValueError,
7594 "width too big");
7595 goto onError;
7596 }
7597 width = width*10 + (c - '0');
7598 }
7599 }
7600 if (c == '.') {
7601 prec = 0;
7602 if (--fmtcnt >= 0)
7603 c = *fmt++;
7604 if (c == '*') {
7605 v = getnextarg(args, arglen, &argidx);
7606 if (v == NULL)
7607 goto onError;
7608 if (!PyInt_Check(v)) {
7609 PyErr_SetString(PyExc_TypeError,
7610 "* wants int");
7611 goto onError;
7612 }
7613 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007614 if (prec == -1 && PyErr_Occurred())
7615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 if (prec < 0)
7617 prec = 0;
7618 if (--fmtcnt >= 0)
7619 c = *fmt++;
7620 }
7621 else if (c >= '0' && c <= '9') {
7622 prec = c - '0';
7623 while (--fmtcnt >= 0) {
7624 c = Py_CHARMASK(*fmt++);
7625 if (c < '0' || c > '9')
7626 break;
7627 if ((prec*10) / 10 != prec) {
7628 PyErr_SetString(PyExc_ValueError,
7629 "prec too big");
7630 goto onError;
7631 }
7632 prec = prec*10 + (c - '0');
7633 }
7634 }
7635 } /* prec */
7636 if (fmtcnt >= 0) {
7637 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (--fmtcnt >= 0)
7639 c = *fmt++;
7640 }
7641 }
7642 if (fmtcnt < 0) {
7643 PyErr_SetString(PyExc_ValueError,
7644 "incomplete format");
7645 goto onError;
7646 }
7647 if (c != '%') {
7648 v = getnextarg(args, arglen, &argidx);
7649 if (v == NULL)
7650 goto onError;
7651 }
7652 sign = 0;
7653 fill = ' ';
7654 switch (c) {
7655
7656 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007657 pbuf = formatbuf;
7658 /* presume that buffer length is at least 1 */
7659 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 len = 1;
7661 break;
7662
7663 case 's':
7664 case 'r':
7665 if (PyUnicode_Check(v) && c == 's') {
7666 temp = v;
7667 Py_INCREF(temp);
7668 }
7669 else {
7670 PyObject *unicode;
7671 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007672 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 else
7674 temp = PyObject_Repr(v);
7675 if (temp == NULL)
7676 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007677 if (PyUnicode_Check(temp))
7678 /* nothing to do */;
7679 else if (PyString_Check(temp)) {
7680 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007681 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007683 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007685 Py_DECREF(temp);
7686 temp = unicode;
7687 if (temp == NULL)
7688 goto onError;
7689 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007690 else {
7691 Py_DECREF(temp);
7692 PyErr_SetString(PyExc_TypeError,
7693 "%s argument has non-string str()");
7694 goto onError;
7695 }
7696 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007697 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 len = PyUnicode_GET_SIZE(temp);
7699 if (prec >= 0 && len > prec)
7700 len = prec;
7701 break;
7702
7703 case 'i':
7704 case 'd':
7705 case 'u':
7706 case 'o':
7707 case 'x':
7708 case 'X':
7709 if (c == 'i')
7710 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007711 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007712 temp = formatlong(v, flags, prec, c);
7713 if (!temp)
7714 goto onError;
7715 pbuf = PyUnicode_AS_UNICODE(temp);
7716 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007717 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007719 else {
7720 pbuf = formatbuf;
7721 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7722 flags, prec, c, v);
7723 if (len < 0)
7724 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007725 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007726 }
7727 if (flags & F_ZERO)
7728 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 break;
7730
7731 case 'e':
7732 case 'E':
7733 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007734 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 case 'g':
7736 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007737 if (c == 'F')
7738 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007739 pbuf = formatbuf;
7740 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7741 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 if (len < 0)
7743 goto onError;
7744 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007745 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 fill = '0';
7747 break;
7748
7749 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007750 pbuf = formatbuf;
7751 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (len < 0)
7753 goto onError;
7754 break;
7755
7756 default:
7757 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007758 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007759 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007760 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007761 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007762 (Py_ssize_t)(fmt - 1 -
7763 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 goto onError;
7765 }
7766 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007767 if (*pbuf == '-' || *pbuf == '+') {
7768 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 len--;
7770 }
7771 else if (flags & F_SIGN)
7772 sign = '+';
7773 else if (flags & F_BLANK)
7774 sign = ' ';
7775 else
7776 sign = 0;
7777 }
7778 if (width < len)
7779 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007780 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 reslen -= rescnt;
7782 rescnt = width + fmtcnt + 100;
7783 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007784 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007785 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007786 PyErr_NoMemory();
7787 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007788 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007789 if (_PyUnicode_Resize(&result, reslen) < 0) {
7790 Py_XDECREF(temp);
7791 goto onError;
7792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 res = PyUnicode_AS_UNICODE(result)
7794 + reslen - rescnt;
7795 }
7796 if (sign) {
7797 if (fill != ' ')
7798 *res++ = sign;
7799 rescnt--;
7800 if (width > len)
7801 width--;
7802 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007803 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7804 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007805 assert(pbuf[1] == c);
7806 if (fill != ' ') {
7807 *res++ = *pbuf++;
7808 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007809 }
Tim Petersfff53252001-04-12 18:38:48 +00007810 rescnt -= 2;
7811 width -= 2;
7812 if (width < 0)
7813 width = 0;
7814 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 if (width > len && !(flags & F_LJUST)) {
7817 do {
7818 --rescnt;
7819 *res++ = fill;
7820 } while (--width > len);
7821 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007822 if (fill == ' ') {
7823 if (sign)
7824 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007825 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007826 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007827 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007828 *res++ = *pbuf++;
7829 *res++ = *pbuf++;
7830 }
7831 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007832 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 res += len;
7834 rescnt -= len;
7835 while (--width >= len) {
7836 --rescnt;
7837 *res++ = ' ';
7838 }
7839 if (dict && (argidx < arglen) && c != '%') {
7840 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007841 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007842 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 goto onError;
7844 }
7845 Py_XDECREF(temp);
7846 } /* '%' */
7847 } /* until end */
7848 if (argidx < arglen && !dict) {
7849 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007850 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 goto onError;
7852 }
7853
Thomas Woutersa96affe2006-03-12 00:29:36 +00007854 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 if (args_owned) {
7857 Py_DECREF(args);
7858 }
7859 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 return (PyObject *)result;
7861
7862 onError:
7863 Py_XDECREF(result);
7864 Py_DECREF(uformat);
7865 if (args_owned) {
7866 Py_DECREF(args);
7867 }
7868 return NULL;
7869}
7870
7871static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007872 (readbufferproc) unicode_buffer_getreadbuf,
7873 (writebufferproc) unicode_buffer_getwritebuf,
7874 (segcountproc) unicode_buffer_getsegcount,
7875 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876};
7877
Jeremy Hylton938ace62002-07-17 16:30:39 +00007878static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007879unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7880
Tim Peters6d6c1a32001-08-02 04:15:00 +00007881static PyObject *
7882unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7883{
7884 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007885 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007886 char *encoding = NULL;
7887 char *errors = NULL;
7888
Guido van Rossume023fe02001-08-30 03:12:59 +00007889 if (type != &PyUnicode_Type)
7890 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007891 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7892 kwlist, &x, &encoding, &errors))
7893 return NULL;
7894 if (x == NULL)
7895 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007896 if (encoding == NULL && errors == NULL)
7897 return PyObject_Unicode(x);
7898 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007899 return PyUnicode_FromEncodedObject(x, encoding, errors);
7900}
7901
Guido van Rossume023fe02001-08-30 03:12:59 +00007902static PyObject *
7903unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7904{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007905 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007906 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007907
7908 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7909 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7910 if (tmp == NULL)
7911 return NULL;
7912 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007913 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007914 if (pnew == NULL) {
7915 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007916 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007917 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007918 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7919 if (pnew->str == NULL) {
7920 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007921 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007922 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007923 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007924 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007925 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7926 pnew->length = n;
7927 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007928 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007929 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007930}
7931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007932PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007933"unicode(string [, encoding[, errors]]) -> object\n\
7934\n\
7935Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007936encoding defaults to the current default string encoding.\n\
7937errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007938
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007939static PyObject *unicode_iter(PyObject *seq);
7940
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941PyTypeObject PyUnicode_Type = {
7942 PyObject_HEAD_INIT(&PyType_Type)
7943 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00007944 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 sizeof(PyUnicodeObject), /* tp_size */
7946 0, /* tp_itemsize */
7947 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007948 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007950 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007952 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007953 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007954 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007956 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 (hashfunc) unicode_hash, /* tp_hash*/
7958 0, /* tp_call*/
7959 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007960 PyObject_GenericGetAttr, /* tp_getattro */
7961 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00007963 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
7964 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007965 unicode_doc, /* tp_doc */
7966 0, /* tp_traverse */
7967 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007968 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007969 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007970 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007971 0, /* tp_iternext */
7972 unicode_methods, /* tp_methods */
7973 0, /* tp_members */
7974 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007975 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007976 0, /* tp_dict */
7977 0, /* tp_descr_get */
7978 0, /* tp_descr_set */
7979 0, /* tp_dictoffset */
7980 0, /* tp_init */
7981 0, /* tp_alloc */
7982 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007983 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984};
7985
7986/* Initialize the Unicode implementation */
7987
Thomas Wouters78890102000-07-22 19:25:51 +00007988void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007990 int i;
7991
Thomas Wouters477c8d52006-05-27 19:21:47 +00007992 /* XXX - move this array to unicodectype.c ? */
7993 Py_UNICODE linebreak[] = {
7994 0x000A, /* LINE FEED */
7995 0x000D, /* CARRIAGE RETURN */
7996 0x001C, /* FILE SEPARATOR */
7997 0x001D, /* GROUP SEPARATOR */
7998 0x001E, /* RECORD SEPARATOR */
7999 0x0085, /* NEXT LINE */
8000 0x2028, /* LINE SEPARATOR */
8001 0x2029, /* PARAGRAPH SEPARATOR */
8002 };
8003
Fred Drakee4315f52000-05-09 19:53:39 +00008004 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008005 unicode_freelist = NULL;
8006 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008008 if (!unicode_empty)
8009 return;
8010
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008011 for (i = 0; i < 256; i++)
8012 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008013 if (PyType_Ready(&PyUnicode_Type) < 0)
8014 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008015
8016 /* initialize the linebreak bloom filter */
8017 bloom_linebreak = make_bloom_mask(
8018 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8019 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008020
8021 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022}
8023
8024/* Finalize the Unicode implementation */
8025
8026void
Thomas Wouters78890102000-07-22 19:25:51 +00008027_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008029 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008030 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008032 Py_XDECREF(unicode_empty);
8033 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008034
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008035 for (i = 0; i < 256; i++) {
8036 if (unicode_latin1[i]) {
8037 Py_DECREF(unicode_latin1[i]);
8038 unicode_latin1[i] = NULL;
8039 }
8040 }
8041
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008042 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 PyUnicodeObject *v = u;
8044 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008045 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008046 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008047 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008048 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008050 unicode_freelist = NULL;
8051 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008053
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008054
8055
8056/********************* Unicode Iterator **************************/
8057
8058typedef struct {
8059 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008060 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008061 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8062} unicodeiterobject;
8063
8064static void
8065unicodeiter_dealloc(unicodeiterobject *it)
8066{
8067 _PyObject_GC_UNTRACK(it);
8068 Py_XDECREF(it->it_seq);
8069 PyObject_GC_Del(it);
8070}
8071
8072static int
8073unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8074{
8075 Py_VISIT(it->it_seq);
8076 return 0;
8077}
8078
8079static PyObject *
8080unicodeiter_next(unicodeiterobject *it)
8081{
8082 PyUnicodeObject *seq;
8083 PyObject *item;
8084
8085 assert(it != NULL);
8086 seq = it->it_seq;
8087 if (seq == NULL)
8088 return NULL;
8089 assert(PyUnicode_Check(seq));
8090
8091 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008092 item = PyUnicode_FromUnicode(
8093 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008094 if (item != NULL)
8095 ++it->it_index;
8096 return item;
8097 }
8098
8099 Py_DECREF(seq);
8100 it->it_seq = NULL;
8101 return NULL;
8102}
8103
8104static PyObject *
8105unicodeiter_len(unicodeiterobject *it)
8106{
8107 Py_ssize_t len = 0;
8108 if (it->it_seq)
8109 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8110 return PyInt_FromSsize_t(len);
8111}
8112
8113PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8114
8115static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008116 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8117 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008118 {NULL, NULL} /* sentinel */
8119};
8120
8121PyTypeObject PyUnicodeIter_Type = {
8122 PyObject_HEAD_INIT(&PyType_Type)
8123 0, /* ob_size */
8124 "unicodeiterator", /* tp_name */
8125 sizeof(unicodeiterobject), /* tp_basicsize */
8126 0, /* tp_itemsize */
8127 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008128 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008129 0, /* tp_print */
8130 0, /* tp_getattr */
8131 0, /* tp_setattr */
8132 0, /* tp_compare */
8133 0, /* tp_repr */
8134 0, /* tp_as_number */
8135 0, /* tp_as_sequence */
8136 0, /* tp_as_mapping */
8137 0, /* tp_hash */
8138 0, /* tp_call */
8139 0, /* tp_str */
8140 PyObject_GenericGetAttr, /* tp_getattro */
8141 0, /* tp_setattro */
8142 0, /* tp_as_buffer */
8143 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8144 0, /* tp_doc */
8145 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8146 0, /* tp_clear */
8147 0, /* tp_richcompare */
8148 0, /* tp_weaklistoffset */
8149 PyObject_SelfIter, /* tp_iter */
8150 (iternextfunc)unicodeiter_next, /* tp_iternext */
8151 unicodeiter_methods, /* tp_methods */
8152 0,
8153};
8154
8155static PyObject *
8156unicode_iter(PyObject *seq)
8157{
8158 unicodeiterobject *it;
8159
8160 if (!PyUnicode_Check(seq)) {
8161 PyErr_BadInternalCall();
8162 return NULL;
8163 }
8164 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8165 if (it == NULL)
8166 return NULL;
8167 it->it_index = 0;
8168 Py_INCREF(seq);
8169 it->it_seq = (PyUnicodeObject *)seq;
8170 _PyObject_GC_TRACK(it);
8171 return (PyObject *)it;
8172}
8173
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008174#ifdef __cplusplus
8175}
8176#endif
8177
8178
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008179/*
8180Local variables:
8181c-basic-offset: 4
8182indent-tabs-mode: nil
8183End:
8184*/