blob: db3f9c4e41989d23817921680f7cd80eeecfa212 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() only pushes
   an object onto the free list while the list holds fewer entries than
   this. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size (unicode->length, i.e.
   counted in Py_UNICODE elements) less than this limit; larger buffers
   are released when the object is put on the free list.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Endianness switches; defaults to little endian.

   This block defines BYTEORDER_IS_BIG_ENDIAN when WORDS_BIGENDIAN is
   defined and BYTEORDER_IS_LITTLE_ENDIAN otherwise. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects: head of a singly linked list of
   recycled PyUnicodeObject structs (pushed by unicode_dealloc(), popped
   by _PyUnicode_New()) and the number of entries currently on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by character value; entries
   are created on first use by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is fixed to "utf-8".  Always use the
   PyUnicode_GetDefaultEncoding() API to access this global. */
static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
130
/* Type of a bloom mask: one bit for each possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero iff the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is an unsigned long: shifting a plain
   int 1 by 31 would overflow into the sign bit (undefined behaviour)
   and the result would be sign-extended when widened to BLOOM_MASK.
   `mask` is parenthesized so that expression arguments bind correctly. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-check with the bloom mask before the exact (and more
   expensive) Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True iff `chr` is a member of the character set `set` (of length
   `setlen`); `mask` is the bloom mask of that set and is used to reject
   most non-members without scanning the set.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable to the PyObject** the API expects. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
396#ifdef HAVE_WCHAR_H
397
398PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000399 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
401 PyUnicodeObject *unicode;
402
403 if (w == NULL) {
404 PyErr_BadInternalCall();
405 return NULL;
406 }
407
408 unicode = _PyUnicode_New(size);
409 if (!unicode)
410 return NULL;
411
412 /* Copy the wchar_t data into the new object */
413#ifdef HAVE_USABLE_WCHAR_T
414 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000415#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 {
417 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000418 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000420 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421 *u++ = *w++;
422 }
423#endif
424
425 return (PyObject *)unicode;
426}
427
Martin v. Löwis18e16552006-02-15 17:27:45 +0000428Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
429 wchar_t *w,
430 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431{
432 if (unicode == NULL) {
433 PyErr_BadInternalCall();
434 return -1;
435 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000436
437 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000438 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000439 size = PyUnicode_GET_SIZE(unicode) + 1;
440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441#ifdef HAVE_USABLE_WCHAR_T
442 memcpy(w, unicode->str, size * sizeof(wchar_t));
443#else
444 {
445 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000446 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000448 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449 *w++ = *u++;
450 }
451#endif
452
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000453 if (size > PyUnicode_GET_SIZE(unicode))
454 return PyUnicode_GET_SIZE(unicode);
455 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 return size;
457}
458
459#endif
460
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000461PyObject *PyUnicode_FromOrdinal(int ordinal)
462{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000463 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000464
465#ifdef Py_UNICODE_WIDE
466 if (ordinal < 0 || ordinal > 0x10ffff) {
467 PyErr_SetString(PyExc_ValueError,
468 "unichr() arg not in range(0x110000) "
469 "(wide Python build)");
470 return NULL;
471 }
472#else
473 if (ordinal < 0 || ordinal > 0xffff) {
474 PyErr_SetString(PyExc_ValueError,
475 "unichr() arg not in range(0x10000) "
476 "(narrow Python build)");
477 return NULL;
478 }
479#endif
480
Hye-Shik Chang40574832004-04-06 07:24:51 +0000481 s[0] = (Py_UNICODE)ordinal;
482 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromObject(register PyObject *obj)
486{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487 /* XXX Perhaps we should make this API an alias of
488 PyObject_Unicode() instead ?! */
489 if (PyUnicode_CheckExact(obj)) {
490 Py_INCREF(obj);
491 return obj;
492 }
493 if (PyUnicode_Check(obj)) {
494 /* For a Unicode subtype that's not a Unicode object,
495 return a true Unicode object with the same data. */
496 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
497 PyUnicode_GET_SIZE(obj));
498 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000499 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
500}
501
502PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
503 const char *encoding,
504 const char *errors)
505{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000507 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000509
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (obj == NULL) {
511 PyErr_BadInternalCall();
512 return NULL;
513 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000515#if 0
516 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000517 that no encodings is given and then redirect to
518 PyObject_Unicode() which then applies the additional logic for
519 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000521 NOTE: This API should really only be used for object which
522 represent *encoded* Unicode !
523
524 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000525 if (PyUnicode_Check(obj)) {
526 if (encoding) {
527 PyErr_SetString(PyExc_TypeError,
528 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000530 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000532 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533#else
534 if (PyUnicode_Check(obj)) {
535 PyErr_SetString(PyExc_TypeError,
536 "decoding Unicode is not supported");
537 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000538 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539#endif
540
541 /* Coerce object */
542 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000543 s = PyString_AS_STRING(obj);
544 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000545 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000546 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
547 /* Overwrite the error message with something more useful in
548 case of a TypeError. */
549 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000550 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000551 "coercing to Unicode: need string or buffer, "
552 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000553 obj->ob_type->tp_name);
554 goto onError;
555 }
Tim Petersced69f82003-09-16 20:30:58 +0000556
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000557 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (len == 0) {
559 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000560 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 }
Tim Petersced69f82003-09-16 20:30:58 +0000562 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000563 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000564
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000565 return v;
566
567 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569}
570
571PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000572 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 const char *encoding,
574 const char *errors)
575{
576 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000577
578 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Shortcuts for common default encodings */
582 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000583 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000584 else if (strcmp(encoding, "latin-1") == 0)
585 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000586#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
587 else if (strcmp(encoding, "mbcs") == 0)
588 return PyUnicode_DecodeMBCS(s, size, errors);
589#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000590 else if (strcmp(encoding, "ascii") == 0)
591 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592
593 /* Decode via the codec registry */
594 buffer = PyBuffer_FromMemory((void *)s, size);
595 if (buffer == NULL)
596 goto onError;
597 unicode = PyCodec_Decode(buffer, encoding, errors);
598 if (unicode == NULL)
599 goto onError;
600 if (!PyUnicode_Check(unicode)) {
601 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000602 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603 unicode->ob_type->tp_name);
604 Py_DECREF(unicode);
605 goto onError;
606 }
607 Py_DECREF(buffer);
608 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000609
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 onError:
611 Py_XDECREF(buffer);
612 return NULL;
613}
614
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000615PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
616 const char *encoding,
617 const char *errors)
618{
619 PyObject *v;
620
621 if (!PyUnicode_Check(unicode)) {
622 PyErr_BadArgument();
623 goto onError;
624 }
625
626 if (encoding == NULL)
627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Decode via the codec registry */
630 v = PyCodec_Decode(unicode, encoding, errors);
631 if (v == NULL)
632 goto onError;
633 return v;
634
635 onError:
636 return NULL;
637}
638
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000640 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 const char *encoding,
642 const char *errors)
643{
644 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 unicode = PyUnicode_FromUnicode(s, size);
647 if (unicode == NULL)
648 return NULL;
649 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
650 Py_DECREF(unicode);
651 return v;
652}
653
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000654PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
655 const char *encoding,
656 const char *errors)
657{
658 PyObject *v;
659
660 if (!PyUnicode_Check(unicode)) {
661 PyErr_BadArgument();
662 goto onError;
663 }
664
665 if (encoding == NULL)
666 encoding = PyUnicode_GetDefaultEncoding();
667
668 /* Encode via the codec registry */
669 v = PyCodec_Encode(unicode, encoding, errors);
670 if (v == NULL)
671 goto onError;
672 return v;
673
674 onError:
675 return NULL;
676}
677
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
679 const char *encoding,
680 const char *errors)
681{
682 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000683
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 if (!PyUnicode_Check(unicode)) {
685 PyErr_BadArgument();
686 goto onError;
687 }
Fred Drakee4315f52000-05-09 19:53:39 +0000688
Tim Petersced69f82003-09-16 20:30:58 +0000689 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000690 encoding = PyUnicode_GetDefaultEncoding();
691
692 /* Shortcuts for common default encodings */
693 if (errors == NULL) {
694 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000695 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000696 else if (strcmp(encoding, "latin-1") == 0)
697 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
699 else if (strcmp(encoding, "mbcs") == 0)
700 return PyUnicode_AsMBCSString(unicode);
701#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000702 else if (strcmp(encoding, "ascii") == 0)
703 return PyUnicode_AsASCIIString(unicode);
704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705
706 /* Encode via the codec registry */
707 v = PyCodec_Encode(unicode, encoding, errors);
708 if (v == NULL)
709 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000710 if (!PyBytes_Check(v)) {
711 if (PyString_Check(v)) {
712 /* Old codec, turn it into bytes */
713 PyObject *b = PyBytes_FromObject(v);
714 Py_DECREF(v);
715 return b;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000718 "encoder did not return a bytes object "
719 "(type=%.400s, encoding=%.20s, errors=%.20s)",
720 v->ob_type->tp_name,
721 encoding ? encoding : "NULL",
722 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000736 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000737 if (v)
738 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000739 if (errors != NULL)
740 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
741 if (errors == NULL) {
742 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
743 PyUnicode_GET_SIZE(unicode),
744 NULL);
745 }
746 else {
747 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
748 }
749 if (!b)
750 return NULL;
751 v = PyString_FromStringAndSize(PyBytes_AsString(b),
752 PyBytes_Size(b));
753 Py_DECREF(b);
754 if (!errors) {
755 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000756 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000757 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000758 return v;
759}
760
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
762{
763 if (!PyUnicode_Check(unicode)) {
764 PyErr_BadArgument();
765 goto onError;
766 }
767 return PyUnicode_AS_UNICODE(unicode);
768
769 onError:
770 return NULL;
771}
772
Martin v. Löwis18e16552006-02-15 17:27:45 +0000773Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774{
775 if (!PyUnicode_Check(unicode)) {
776 PyErr_BadArgument();
777 goto onError;
778 }
779 return PyUnicode_GET_SIZE(unicode);
780
781 onError:
782 return -1;
783}
784
Thomas Wouters78890102000-07-22 19:25:51 +0000785const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000786{
787 return unicode_default_encoding;
788}
789
790int PyUnicode_SetDefaultEncoding(const char *encoding)
791{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000792 if (strcmp(encoding, unicode_default_encoding) != 0) {
793 PyErr_Format(PyExc_ValueError,
794 "Can only set default encoding to %s",
795 unicode_default_encoding);
796 return -1;
797 }
Fred Drakee4315f52000-05-09 19:53:39 +0000798 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000799}
800
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801/* error handling callback helper:
802 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000803 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000804 and adjust various state variables.
805 return 0 on success, -1 on error
806*/
807
808static
809int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
810 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000811 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
812 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815
816 PyObject *restuple = NULL;
817 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
819 Py_ssize_t requiredsize;
820 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000821 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000822 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000823 int res = -1;
824
825 if (*errorHandler == NULL) {
826 *errorHandler = PyCodec_LookupError(errors);
827 if (*errorHandler == NULL)
828 goto onError;
829 }
830
831 if (*exceptionObject == NULL) {
832 *exceptionObject = PyUnicodeDecodeError_Create(
833 encoding, input, insize, *startinpos, *endinpos, reason);
834 if (*exceptionObject == NULL)
835 goto onError;
836 }
837 else {
838 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
839 goto onError;
840 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
841 goto onError;
842 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
843 goto onError;
844 }
845
846 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
847 if (restuple == NULL)
848 goto onError;
849 if (!PyTuple_Check(restuple)) {
850 PyErr_Format(PyExc_TypeError, &argparse[4]);
851 goto onError;
852 }
853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
854 goto onError;
855 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000856 newpos = insize+newpos;
857 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000858 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000859 goto onError;
860 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861
862 /* need more space? (at least enough for what we
863 have+the replacement+the rest of the string (starting
864 at the new input position), so we won't have to check space
865 when there are no errors in the rest of the string) */
866 repptr = PyUnicode_AS_UNICODE(repunicode);
867 repsize = PyUnicode_GET_SIZE(repunicode);
868 requiredsize = *outpos + repsize + insize-newpos;
869 if (requiredsize > outsize) {
870 if (requiredsize<2*outsize)
871 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000872 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000873 goto onError;
874 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
875 }
876 *endinpos = newpos;
877 *inptr = input + newpos;
878 Py_UNICODE_COPY(*outptr, repptr, repsize);
879 *outptr += repsize;
880 *outpos += repsize;
881 /* we made it! */
882 res = 0;
883
884 onError:
885 Py_XDECREF(restuple);
886 return res;
887}
888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889/* --- UTF-7 Codec -------------------------------------------------------- */
890
891/* see RFC2152 for details */
892
/* Per-character UTF-7 classification of the 128 ASCII code points,
   indexed by character code.  Says whether a character is "special",
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F (control characters; TAB, LF and CR are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F (space and punctuation) */
    2, 3, 3, 3, 3, 3, 3, 0,  0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F (digits and punctuation) */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 3, 3, 3, 1, 1
};
911
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7 given the two "also encode this class" flags.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.
   Uses explicit ASCII ranges instead of isalnum(): c may be any
   Py_UNICODE value, for which isalnum() is undefined outside the
   unsigned char range, and isalnum() is locale dependent.
   Evaluates c more than once; pass a plain variable. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits to *out++ for as long as at
   least 6 of the `bits` pending bits held in ch remain; bits is reduced
   accordingly.  Expands to a bare while loop. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): emit 16-bit units to *out++ for as
   long as at least 16 of the `bits` pending bits held in ch remain.
   Relies on an `errmsg` variable and a `utf7Error` label at the point
   of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 const char *errors)
958{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000959 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t startinpos;
961 Py_ssize_t endinpos;
962 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000963 const char *e;
964 PyUnicodeObject *unicode;
965 Py_UNICODE *p;
966 const char *errmsg = "";
967 int inShift = 0;
968 unsigned int bitsleft = 0;
969 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000970 int surrogate = 0;
971 PyObject *errorHandler = NULL;
972 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000973
974 unicode = _PyUnicode_New(size);
975 if (!unicode)
976 return NULL;
977 if (size == 0)
978 return (PyObject *)unicode;
979
980 p = unicode->str;
981 e = s + size;
982
983 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 Py_UNICODE ch;
985 restart:
986 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987
988 if (inShift) {
989 if ((ch == '-') || !B64CHAR(ch)) {
990 inShift = 0;
991 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000992
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
994 if (bitsleft >= 6) {
995 /* The shift sequence has a partial character in it. If
996 bitsleft < 6 then we could just classify it as padding
997 but that is not the case here */
998
999 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001000 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 }
1002 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001003 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001004 here so indicate the potential of a misencoded character. */
1005
1006 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1007 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1008 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001009 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 }
1011
1012 if (ch == '-') {
1013 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001014 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 inShift = 1;
1016 }
1017 } else if (SPECIAL(ch,0,0)) {
1018 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001019 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 } else {
1021 *p++ = ch;
1022 }
1023 } else {
1024 charsleft = (charsleft << 6) | UB64(ch);
1025 bitsleft += 6;
1026 s++;
1027 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1028 }
1029 }
1030 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 s++;
1033 if (s < e && *s == '-') {
1034 s++;
1035 *p++ = '+';
1036 } else
1037 {
1038 inShift = 1;
1039 bitsleft = 0;
1040 }
1041 }
1042 else if (SPECIAL(ch,0,0)) {
1043 errmsg = "unexpected special character";
1044 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001045 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
1047 else {
1048 *p++ = ch;
1049 s++;
1050 }
1051 continue;
1052 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = s-starts;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", errmsg,
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
1060 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
1063 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 outpos = p-PyUnicode_AS_UNICODE(unicode);
1065 endinpos = size;
1066 if (unicode_decode_call_errorhandler(
1067 errors, &errorHandler,
1068 "utf7", "unterminated shift sequence",
1069 starts, size, &startinpos, &endinpos, &exc, &s,
1070 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001072 if (s < e)
1073 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 }
1075
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001076 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 goto onError;
1078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 Py_XDECREF(errorHandler);
1080 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 return (PyObject *)unicode;
1082
1083onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001084 Py_XDECREF(errorHandler);
1085 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 Py_DECREF(unicode);
1087 return NULL;
1088}
1089
1090
1091PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 int encodeSetO,
1094 int encodeWhiteSpace,
1095 const char *errors)
1096{
1097 PyObject *v;
1098 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001099 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001100 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001101 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102 unsigned int bitsleft = 0;
1103 unsigned long charsleft = 0;
1104 char * out;
1105 char * start;
1106
1107 if (size == 0)
1108 return PyString_FromStringAndSize(NULL, 0);
1109
1110 v = PyString_FromStringAndSize(NULL, cbAllocated);
1111 if (v == NULL)
1112 return NULL;
1113
1114 start = out = PyString_AS_STRING(v);
1115 for (;i < size; ++i) {
1116 Py_UNICODE ch = s[i];
1117
1118 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 if (ch == '+') {
1120 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 *out++ = '-';
1122 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1123 charsleft = ch;
1124 bitsleft = 16;
1125 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001126 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001127 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001128 } else {
1129 *out++ = (char) ch;
1130 }
1131 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001132 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1133 *out++ = B64(charsleft << (6-bitsleft));
1134 charsleft = 0;
1135 bitsleft = 0;
1136 /* Characters not in the BASE64 set implicitly unshift the sequence
1137 so no '-' is required, except if the character is itself a '-' */
1138 if (B64CHAR(ch) || ch == '-') {
1139 *out++ = '-';
1140 }
1141 inShift = 0;
1142 *out++ = (char) ch;
1143 } else {
1144 bitsleft += 16;
1145 charsleft = (charsleft << 16) | ch;
1146 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1147
1148 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001149 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 or '-' then the shift sequence will be terminated implicitly and we
1151 don't have to insert a '-'. */
1152
1153 if (bitsleft == 0) {
1154 if (i + 1 < size) {
1155 Py_UNICODE ch2 = s[i+1];
1156
1157 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 } else if (B64CHAR(ch2) || ch2 == '-') {
1160 *out++ = '-';
1161 inShift = 0;
1162 } else {
1163 inShift = 0;
1164 }
1165
1166 }
1167 else {
1168 *out++ = '-';
1169 inShift = 0;
1170 }
1171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 if (bitsleft) {
1176 *out++= B64(charsleft << (6-bitsleft) );
1177 *out++ = '-';
1178 }
1179
Tim Peters5de98422002-04-27 18:44:32 +00001180 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001181 return v;
1182}
1183
1184#undef SPECIAL
1185#undef B64
1186#undef B64CHAR
1187#undef UB64
1188#undef ENCODE
1189#undef DECODE
1190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191/* --- UTF-8 Codec -------------------------------------------------------- */
1192
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero means the byte is not a legal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte prefixes, then two illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 const char *errors)
1218{
Walter Dörwald69652032004-09-07 20:24:22 +00001219 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1220}
1221
1222PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001223 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001224 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001225 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t startinpos;
1230 Py_ssize_t endinpos;
1231 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *e;
1233 PyUnicodeObject *unicode;
1234 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 PyObject *errorHandler = NULL;
1237 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Note: size will always be longer than the resulting Unicode
1240 character count */
1241 unicode = _PyUnicode_New(size);
1242 if (!unicode)
1243 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001244 if (size == 0) {
1245 if (consumed)
1246 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Unpack UTF-8 encoded data */
1251 p = unicode->str;
1252 e = s + size;
1253
1254 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001255 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256
1257 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 s++;
1260 continue;
1261 }
1262
1263 n = utf8_code_length[ch];
1264
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001266 if (consumed)
1267 break;
1268 else {
1269 errmsg = "unexpected end of data";
1270 startinpos = s-starts;
1271 endinpos = size;
1272 goto utf8Error;
1273 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 switch (n) {
1277
1278 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001291 if ((s[1] & 0xc0) != 0x80) {
1292 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 startinpos = s-starts;
1294 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 goto utf8Error;
1296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001298 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 errmsg = "illegal encoding";
1302 goto utf8Error;
1303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001309 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 (s[2] & 0xc0) != 0x80) {
1311 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001312 startinpos = s-starts;
1313 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 goto utf8Error;
1315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001317 if (ch < 0x0800) {
1318 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001319 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320
1321 XXX For wide builds (UCS-4) we should probably try
1322 to recombine the surrogates into a single code
1323 unit.
1324 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001331 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 break;
1333
1334 case 4:
1335 if ((s[1] & 0xc0) != 0x80 ||
1336 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001337 (s[3] & 0xc0) != 0x80) {
1338 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001343 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1344 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1345 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001347 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001349 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001350 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001351 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 startinpos = s-starts;
1353 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001354 goto utf8Error;
1355 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001356#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001357 *p++ = (Py_UNICODE)ch;
1358#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001360
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001361 /* translate from 10000..10FFFF to 0..FFFF */
1362 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001363
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001364 /* high surrogate = top 10 bits added to D800 */
1365 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001367 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001368 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 break;
1371
1372 default:
1373 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 startinpos = s-starts;
1376 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001377 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 }
1379 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 outpos = p-PyUnicode_AS_UNICODE(unicode);
1384 if (unicode_decode_call_errorhandler(
1385 errors, &errorHandler,
1386 "utf8", errmsg,
1387 starts, size, &startinpos, &endinpos, &exc, &s,
1388 (PyObject **)&unicode, &outpos, &p))
1389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Walter Dörwald69652032004-09-07 20:24:22 +00001391 if (consumed)
1392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393
1394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 goto onError;
1397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 Py_XDECREF(errorHandler);
1399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 return (PyObject *)unicode;
1401
1402onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 Py_XDECREF(errorHandler);
1404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 Py_DECREF(unicode);
1406 return NULL;
1407}
1408
Tim Peters602f7402002-04-27 18:03:26 +00001409/* Allocation strategy: if the string is short, convert into a stack buffer
1410 and allocate exactly as much space needed at the end. Else allocate the
1411 maximum possible needed (4 result bytes per Unicode character), and return
1412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001414PyObject *
1415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418{
Tim Peters602f7402002-04-27 18:03:26 +00001419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001420
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001422 PyObject *v; /* result string object */
1423 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001424 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 assert(s != NULL);
1429 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430
Tim Peters602f7402002-04-27 18:03:26 +00001431 if (size <= MAX_SHORT_UNICHARS) {
1432 /* Write into the stack buffer; nallocated can't overflow.
1433 * At the end, we'll allocate exactly as much heap space as it
1434 * turns out we need.
1435 */
1436 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1437 v = NULL; /* will allocate after we're done */
1438 p = stackbuf;
1439 }
1440 else {
1441 /* Overallocate on the heap, and give the excess back at the end. */
1442 nallocated = size * 4;
1443 if (nallocated / 4 != size) /* overflow! */
1444 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001445 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001446 if (v == NULL)
1447 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001448 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001449 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001450
Tim Peters602f7402002-04-27 18:03:26 +00001451 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001452 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001453
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001454 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001457
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001459 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001460 *p++ = (char)(0xc0 | (ch >> 6));
1461 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001463 else {
Tim Peters602f7402002-04-27 18:03:26 +00001464 /* Encode UCS2 Unicode ordinals */
1465 if (ch < 0x10000) {
1466 /* Special case: check for high surrogate */
1467 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1468 Py_UCS4 ch2 = s[i];
1469 /* Check for low surrogate and combine the two to
1470 form a UCS4 value */
1471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001472 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001473 i++;
1474 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001475 }
Tim Peters602f7402002-04-27 18:03:26 +00001476 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001478 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 continue;
1482 }
1483encodeUCS4:
1484 /* Encode UCS4 Unicode ordinals */
1485 *p++ = (char)(0xf0 | (ch >> 18));
1486 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1487 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1488 *p++ = (char)(0x80 | (ch & 0x3f));
1489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001491
Tim Peters602f7402002-04-27 18:03:26 +00001492 if (v == NULL) {
1493 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001494 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001495 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001497 }
1498 else {
1499 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001500 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001501 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001502 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505
Tim Peters602f7402002-04-27 18:03:26 +00001506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 if (!PyUnicode_Check(unicode)) {
1512 PyErr_BadArgument();
1513 return NULL;
1514 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518}
1519
1520/* --- UTF-16 Codec ------------------------------------------------------- */
1521
Tim Peters772747b2001-08-09 22:21:55 +00001522PyObject *
1523PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001525 const char *errors,
1526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527{
Walter Dörwald69652032004-09-07 20:24:22 +00001528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1529}
1530
1531PyObject *
1532PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001533 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001534 const char *errors,
1535 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t startinpos;
1540 Py_ssize_t endinpos;
1541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 PyUnicodeObject *unicode;
1543 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001544 const unsigned char *q, *e;
1545 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001546 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001547 /* Offsets from q for retrieving byte pairs in the right order. */
1548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1549 int ihi = 1, ilo = 0;
1550#else
1551 int ihi = 0, ilo = 1;
1552#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 PyObject *errorHandler = NULL;
1554 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 /* Note: size will always be longer than the resulting Unicode
1557 character count */
1558 unicode = _PyUnicode_New(size);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-16 encoded data */
1565 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001566 q = (unsigned char *)s;
1567 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568
1569 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001570 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001577 if (size >= 2) {
1578 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = -1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = 1;
1587 }
Tim Petersced69f82003-09-16 20:30:58 +00001588#else
Walter Dörwald69652032004-09-07 20:24:22 +00001589 if (bom == 0xFEFF) {
1590 q += 2;
1591 bo = 1;
1592 }
1593 else if (bom == 0xFFFE) {
1594 q += 2;
1595 bo = -1;
1596 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001597#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001598 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Tim Peters772747b2001-08-09 22:21:55 +00001601 if (bo == -1) {
1602 /* force LE */
1603 ihi = 1;
1604 ilo = 0;
1605 }
1606 else if (bo == 1) {
1607 /* force BE */
1608 ihi = 0;
1609 ilo = 1;
1610 }
1611
1612 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001614 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001616 if (consumed)
1617 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 errmsg = "truncated data";
1619 startinpos = ((const char *)q)-starts;
1620 endinpos = ((const char *)e)-starts;
1621 goto utf16Error;
1622 /* The remaining input chars are ignored if the callback
1623 chooses to skip the input */
1624 }
1625 ch = (q[ihi] << 8) | q[ilo];
1626
Tim Peters772747b2001-08-09 22:21:55 +00001627 q += 2;
1628
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 if (ch < 0xD800 || ch > 0xDFFF) {
1630 *p++ = ch;
1631 continue;
1632 }
1633
1634 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 if (q >= e) {
1636 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 startinpos = (((const char *)q)-2)-starts;
1638 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001639 goto utf16Error;
1640 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001641 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1643 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001644 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001645#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001646 *p++ = ch;
1647 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648#else
1649 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001650#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001651 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652 }
1653 else {
1654 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 startinpos = (((const char *)q)-4)-starts;
1656 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001657 goto utf16Error;
1658 }
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001661 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 startinpos = (((const char *)q)-2)-starts;
1663 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 /* Fall through to report the error */
1665
1666 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 outpos = p-PyUnicode_AS_UNICODE(unicode);
1668 if (unicode_decode_call_errorhandler(
1669 errors, &errorHandler,
1670 "utf16", errmsg,
1671 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1672 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 }
1675
1676 if (byteorder)
1677 *byteorder = bo;
1678
Walter Dörwald69652032004-09-07 20:24:22 +00001679 if (consumed)
1680 *consumed = (const char *)q-starts;
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001683 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 goto onError;
1685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 Py_XDECREF(errorHandler);
1687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 return (PyObject *)unicode;
1689
1690onError:
1691 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 Py_XDECREF(errorHandler);
1693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return NULL;
1695}
1696
Tim Peters772747b2001-08-09 22:21:55 +00001697PyObject *
1698PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001700 const char *errors,
1701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702{
1703 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001704 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001705#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001706 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001707#else
1708 const int pairs = 0;
1709#endif
Tim Peters772747b2001-08-09 22:21:55 +00001710 /* Offsets from p for storing byte pairs in the right order. */
1711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1712 int ihi = 1, ilo = 0;
1713#else
1714 int ihi = 0, ilo = 1;
1715#endif
1716
1717#define STORECHAR(CH) \
1718 do { \
1719 p[ihi] = ((CH) >> 8) & 0xff; \
1720 p[ilo] = (CH) & 0xff; \
1721 p += 2; \
1722 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001725 for (i = pairs = 0; i < size; i++)
1726 if (s[i] >= 0x10000)
1727 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001728#endif
Tim Petersced69f82003-09-16 20:30:58 +00001729 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001730 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 if (v == NULL)
1732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
Tim Peters772747b2001-08-09 22:21:55 +00001734 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001736 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001737 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001738 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001739
1740 if (byteorder == -1) {
1741 /* force LE */
1742 ihi = 1;
1743 ilo = 0;
1744 }
1745 else if (byteorder == 1) {
1746 /* force BE */
1747 ihi = 0;
1748 ilo = 1;
1749 }
1750
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751 while (size-- > 0) {
1752 Py_UNICODE ch = *s++;
1753 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001754#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1757 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001759#endif
Tim Peters772747b2001-08-09 22:21:55 +00001760 STORECHAR(ch);
1761 if (ch2)
1762 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001765#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766}
1767
1768PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1769{
1770 if (!PyUnicode_Check(unicode)) {
1771 PyErr_BadArgument();
1772 return NULL;
1773 }
1774 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1775 PyUnicode_GET_SIZE(unicode),
1776 NULL,
1777 0);
1778}
1779
1780/* --- Unicode Escape Codec ----------------------------------------------- */
1781
Fredrik Lundh06d12682001-01-24 07:59:11 +00001782static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001783
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001785 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 const char *errors)
1787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001789 Py_ssize_t startinpos;
1790 Py_ssize_t endinpos;
1791 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001796 char* message;
1797 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001798 PyObject *errorHandler = NULL;
1799 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 /* Escaped strings will always be longer than the resulting
1802 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 length after conversion to the true value.
1804 (but if the error callback returns a long replacement string
1805 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 v = _PyUnicode_New(size);
1807 if (v == NULL)
1808 goto onError;
1809 if (size == 0)
1810 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (s < end) {
1816 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001817 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 /* Non-escape characters are interpreted as Unicode ordinals */
1821 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 continue;
1824 }
1825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 /* \ - Escapes */
1828 s++;
1829 switch (*s++) {
1830
1831 /* \x escapes */
1832 case '\n': break;
1833 case '\\': *p++ = '\\'; break;
1834 case '\'': *p++ = '\''; break;
1835 case '\"': *p++ = '\"'; break;
1836 case 'b': *p++ = '\b'; break;
1837 case 'f': *p++ = '\014'; break; /* FF */
1838 case 't': *p++ = '\t'; break;
1839 case 'n': *p++ = '\n'; break;
1840 case 'r': *p++ = '\r'; break;
1841 case 'v': *p++ = '\013'; break; /* VT */
1842 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1843
1844 /* \OOO (octal) escapes */
1845 case '0': case '1': case '2': case '3':
1846 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001847 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001849 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001851 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001853 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 break;
1855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* hex escapes */
1857 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 digits = 2;
1860 message = "truncated \\xXX escape";
1861 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865 digits = 4;
1866 message = "truncated \\uXXXX escape";
1867 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
Fredrik Lundhccc74732001-02-18 22:13:49 +00001869 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001870 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 digits = 8;
1872 message = "truncated \\UXXXXXXXX escape";
1873 hexescape:
1874 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 outpos = p-PyUnicode_AS_UNICODE(v);
1876 if (s+digits>end) {
1877 endinpos = size;
1878 if (unicode_decode_call_errorhandler(
1879 errors, &errorHandler,
1880 "unicodeescape", "end of string in escape sequence",
1881 starts, size, &startinpos, &endinpos, &exc, &s,
1882 (PyObject **)&v, &outpos, &p))
1883 goto onError;
1884 goto nextByte;
1885 }
1886 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001887 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001888 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001889 endinpos = (s+i+1)-starts;
1890 if (unicode_decode_call_errorhandler(
1891 errors, &errorHandler,
1892 "unicodeescape", message,
1893 starts, size, &startinpos, &endinpos, &exc, &s,
1894 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001896 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001897 }
1898 chr = (chr<<4) & ~0xF;
1899 if (c >= '0' && c <= '9')
1900 chr += c - '0';
1901 else if (c >= 'a' && c <= 'f')
1902 chr += 10 + c - 'a';
1903 else
1904 chr += 10 + c - 'A';
1905 }
1906 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001907 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 /* _decoding_error will have already written into the
1909 target buffer. */
1910 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001912 /* when we get here, chr is a 32-bit unicode character */
1913 if (chr <= 0xffff)
1914 /* UCS-2 character */
1915 *p++ = (Py_UNICODE) chr;
1916 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001917 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001918 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001919#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001920 *p++ = chr;
1921#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001922 chr -= 0x10000L;
1923 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001924 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 endinpos = s-starts;
1928 outpos = p-PyUnicode_AS_UNICODE(v);
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "illegal Unicode character",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001934 goto onError;
1935 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 break;
1937
1938 /* \N{name} */
1939 case 'N':
1940 message = "malformed \\N character escape";
1941 if (ucnhash_CAPI == NULL) {
1942 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001943 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001944 m = PyImport_ImportModule("unicodedata");
1945 if (m == NULL)
1946 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001948 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001949 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001951 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001952 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 if (ucnhash_CAPI == NULL)
1954 goto ucnhashError;
1955 }
1956 if (*s == '{') {
1957 const char *start = s+1;
1958 /* look for the closing brace */
1959 while (*s != '}' && s < end)
1960 s++;
1961 if (s > start && s < end && *s == '}') {
1962 /* found a name. look it up in the unicode database */
1963 message = "unknown Unicode character name";
1964 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001965 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001966 goto store;
1967 }
1968 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001969 endinpos = s-starts;
1970 outpos = p-PyUnicode_AS_UNICODE(v);
1971 if (unicode_decode_call_errorhandler(
1972 errors, &errorHandler,
1973 "unicodeescape", message,
1974 starts, size, &startinpos, &endinpos, &exc, &s,
1975 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001976 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001977 break;
1978
1979 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001980 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 message = "\\ at end of string";
1982 s--;
1983 endinpos = s-starts;
1984 outpos = p-PyUnicode_AS_UNICODE(v);
1985 if (unicode_decode_call_errorhandler(
1986 errors, &errorHandler,
1987 "unicodeescape", message,
1988 starts, size, &startinpos, &endinpos, &exc, &s,
1989 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001990 goto onError;
1991 }
1992 else {
1993 *p++ = '\\';
1994 *p++ = (unsigned char)s[-1];
1995 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001996 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001998 nextByte:
1999 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002001 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002003 Py_XDECREF(errorHandler);
2004 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002006
Fredrik Lundhccc74732001-02-18 22:13:49 +00002007ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002008 PyErr_SetString(
2009 PyExc_UnicodeError,
2010 "\\N escapes not supported (can't load unicodedata module)"
2011 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002015 return NULL;
2016
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 Py_XDECREF(errorHandler);
2020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 return NULL;
2022}
2023
/* unicodeescape_string() returns a Unicode-Escape string version of the
   Unicode data it is given.

   If quotes is true, the string is enclosed in "" or '' quotes as
   appropriate.

*/
2030
Thomas Wouters477c8d52006-05-27 19:21:47 +00002031Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2032 Py_ssize_t size,
2033 Py_UNICODE ch)
2034{
2035 /* like wcschr, but doesn't stop at NULL characters */
2036
2037 while (size-- > 0) {
2038 if (*s == ch)
2039 return s;
2040 s++;
2041 }
2042
2043 return NULL;
2044}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002045
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046static
2047PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 int quotes)
2050{
2051 PyObject *repr;
2052 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002054 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055
Thomas Wouters89f507f2006-12-13 04:49:30 +00002056 /* XXX(nnorwitz): rather than over-allocating, it would be
2057 better to choose a different scheme. Perhaps scan the
2058 first N-chars of the string and allocate based on that size.
2059 */
2060 /* Initial allocation is based on the longest-possible unichr
2061 escape.
2062
2063 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2064 unichr, so in this case it's the longest unichr escape. In
2065 narrow (UTF-16) builds this is five chars per source unichr
2066 since there are two unichrs in the surrogate pair, so in narrow
2067 (UTF-16) builds it's not the longest unichr escape.
2068
2069 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2070 so in the narrow (UTF-16) build case it's the longest unichr
2071 escape.
2072 */
2073
2074 repr = PyString_FromStringAndSize(NULL,
2075 2
2076#ifdef Py_UNICODE_WIDE
2077 + 10*size
2078#else
2079 + 6*size
2080#endif
2081 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 if (repr == NULL)
2083 return NULL;
2084
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086
2087 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002088 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 !findchar(s, size, '"')) ? '"' : '\'';
2090 }
2091 while (size-- > 0) {
2092 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002093
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002094 /* Escape quotes and backslashes */
2095 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002096 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 *p++ = '\\';
2098 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002099 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002101
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002102#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002103 /* Map 21-bit characters to '\U00xxxxxx' */
2104 else if (ch >= 0x10000) {
2105 *p++ = '\\';
2106 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2108 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2109 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2110 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2111 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2112 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2113 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002114 *p++ = hexdigit[ch & 0x0000000F];
2115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002117#else
2118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002119 else if (ch >= 0xD800 && ch < 0xDC00) {
2120 Py_UNICODE ch2;
2121 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002123 ch2 = *s++;
2124 size--;
2125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2127 *p++ = '\\';
2128 *p++ = 'U';
2129 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2130 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2131 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2132 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2133 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2134 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2135 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2136 *p++ = hexdigit[ucs & 0x0000000F];
2137 continue;
2138 }
2139 /* Fall through: isolated surrogates are copied as-is */
2140 s--;
2141 size++;
2142 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002143#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002146 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 *p++ = '\\';
2148 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002149 *p++ = hexdigit[(ch >> 12) & 0x000F];
2150 *p++ = hexdigit[(ch >> 8) & 0x000F];
2151 *p++ = hexdigit[(ch >> 4) & 0x000F];
2152 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002155 /* Map special whitespace to '\t', \n', '\r' */
2156 else if (ch == '\t') {
2157 *p++ = '\\';
2158 *p++ = 't';
2159 }
2160 else if (ch == '\n') {
2161 *p++ = '\\';
2162 *p++ = 'n';
2163 }
2164 else if (ch == '\r') {
2165 *p++ = '\\';
2166 *p++ = 'r';
2167 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002168
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002169 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002170 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002172 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002173 *p++ = hexdigit[(ch >> 4) & 0x000F];
2174 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002176
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002182 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183
2184 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002185 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return repr;
2187}
2188
2189PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191{
2192 return unicodeescape_string(s, size, 0);
2193}
2194
2195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2196{
2197 if (!PyUnicode_Check(unicode)) {
2198 PyErr_BadArgument();
2199 return NULL;
2200 }
2201 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2202 PyUnicode_GET_SIZE(unicode));
2203}
2204
2205/* --- Raw Unicode Escape Codec ------------------------------------------- */
2206
2207PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 const char *errors)
2210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002212 Py_ssize_t startinpos;
2213 Py_ssize_t endinpos;
2214 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 const char *end;
2218 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 PyObject *errorHandler = NULL;
2220 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 /* Escaped strings will always be longer than the resulting
2223 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 length after conversion to the true value. (But decoding error
2225 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 v = _PyUnicode_New(size);
2227 if (v == NULL)
2228 goto onError;
2229 if (size == 0)
2230 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 end = s + size;
2233 while (s < end) {
2234 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002235 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002237 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 /* Non-escape characters are interpreted as Unicode ordinals */
2240 if (*s != '\\') {
2241 *p++ = (unsigned char)*s++;
2242 continue;
2243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
2246 /* \u-escapes are only interpreted iff the number of leading
2247 backslashes if odd */
2248 bs = s;
2249 for (;s < end;) {
2250 if (*s != '\\')
2251 break;
2252 *p++ = (unsigned char)*s++;
2253 }
2254 if (((s - bs) & 1) == 0 ||
2255 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 continue;
2258 }
2259 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002260 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 s++;
2262
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002263 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002265 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 endinpos = s-starts;
2269 if (unicode_decode_call_errorhandler(
2270 errors, &errorHandler,
2271 "rawunicodeescape", "truncated \\uXXXX",
2272 starts, size, &startinpos, &endinpos, &exc, &s,
2273 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 x = (x<<4) & ~0xF;
2278 if (c >= '0' && c <= '9')
2279 x += c - '0';
2280 else if (c >= 'a' && c <= 'f')
2281 x += 10 + c - 'a';
2282 else
2283 x += 10 + c - 'A';
2284 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002285#ifndef Py_UNICODE_WIDE
2286 if (x > 0x10000) {
2287 if (unicode_decode_call_errorhandler(
2288 errors, &errorHandler,
2289 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2290 starts, size, &startinpos, &endinpos, &exc, &s,
2291 (PyObject **)&v, &outpos, &p))
2292 goto onError;
2293 }
2294#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 *p++ = x;
2296 nextByte:
2297 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002299 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002300 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 Py_XDECREF(errorHandler);
2302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002304
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 onError:
2306 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 Py_XDECREF(errorHandler);
2308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 return NULL;
2310}
2311
2312PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002313 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314{
2315 PyObject *repr;
2316 char *p;
2317 char *q;
2318
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002319 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002321#ifdef Py_UNICODE_WIDE
2322 repr = PyString_FromStringAndSize(NULL, 10 * size);
2323#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 if (repr == NULL)
2327 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002328 if (size == 0)
2329 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
2331 p = q = PyString_AS_STRING(repr);
2332 while (size-- > 0) {
2333 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002334#ifdef Py_UNICODE_WIDE
2335 /* Map 32-bit characters to '\Uxxxxxxxx' */
2336 if (ch >= 0x10000) {
2337 *p++ = '\\';
2338 *p++ = 'U';
2339 *p++ = hexdigit[(ch >> 28) & 0xf];
2340 *p++ = hexdigit[(ch >> 24) & 0xf];
2341 *p++ = hexdigit[(ch >> 20) & 0xf];
2342 *p++ = hexdigit[(ch >> 16) & 0xf];
2343 *p++ = hexdigit[(ch >> 12) & 0xf];
2344 *p++ = hexdigit[(ch >> 8) & 0xf];
2345 *p++ = hexdigit[(ch >> 4) & 0xf];
2346 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002347 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002348 else
2349#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 /* Map 16-bit characters to '\uxxxx' */
2351 if (ch >= 256) {
2352 *p++ = '\\';
2353 *p++ = 'u';
2354 *p++ = hexdigit[(ch >> 12) & 0xf];
2355 *p++ = hexdigit[(ch >> 8) & 0xf];
2356 *p++ = hexdigit[(ch >> 4) & 0xf];
2357 *p++ = hexdigit[ch & 15];
2358 }
2359 /* Copy everything else as-is */
2360 else
2361 *p++ = (char) ch;
2362 }
2363 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002364 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 return repr;
2366}
2367
2368PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2369{
2370 if (!PyUnicode_Check(unicode)) {
2371 PyErr_BadArgument();
2372 return NULL;
2373 }
2374 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode));
2376}
2377
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002378/* --- Unicode Internal Codec ------------------------------------------- */
2379
2380PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002381 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002382 const char *errors)
2383{
2384 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002385 Py_ssize_t startinpos;
2386 Py_ssize_t endinpos;
2387 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002388 PyUnicodeObject *v;
2389 Py_UNICODE *p;
2390 const char *end;
2391 const char *reason;
2392 PyObject *errorHandler = NULL;
2393 PyObject *exc = NULL;
2394
Neal Norwitzd43069c2006-01-08 01:12:10 +00002395#ifdef Py_UNICODE_WIDE
2396 Py_UNICODE unimax = PyUnicode_GetMax();
2397#endif
2398
Thomas Wouters89f507f2006-12-13 04:49:30 +00002399 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002400 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2401 if (v == NULL)
2402 goto onError;
2403 if (PyUnicode_GetSize((PyObject *)v) == 0)
2404 return (PyObject *)v;
2405 p = PyUnicode_AS_UNICODE(v);
2406 end = s + size;
2407
2408 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002409 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002410 /* We have to sanity check the raw data, otherwise doom looms for
2411 some malformed UCS-4 data. */
2412 if (
2413 #ifdef Py_UNICODE_WIDE
2414 *p > unimax || *p < 0 ||
2415 #endif
2416 end-s < Py_UNICODE_SIZE
2417 )
2418 {
2419 startinpos = s - starts;
2420 if (end-s < Py_UNICODE_SIZE) {
2421 endinpos = end-starts;
2422 reason = "truncated input";
2423 }
2424 else {
2425 endinpos = s - starts + Py_UNICODE_SIZE;
2426 reason = "illegal code point (> 0x10FFFF)";
2427 }
2428 outpos = p - PyUnicode_AS_UNICODE(v);
2429 if (unicode_decode_call_errorhandler(
2430 errors, &errorHandler,
2431 "unicode_internal", reason,
2432 starts, size, &startinpos, &endinpos, &exc, &s,
2433 (PyObject **)&v, &outpos, &p)) {
2434 goto onError;
2435 }
2436 }
2437 else {
2438 p++;
2439 s += Py_UNICODE_SIZE;
2440 }
2441 }
2442
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002443 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002444 goto onError;
2445 Py_XDECREF(errorHandler);
2446 Py_XDECREF(exc);
2447 return (PyObject *)v;
2448
2449 onError:
2450 Py_XDECREF(v);
2451 Py_XDECREF(errorHandler);
2452 Py_XDECREF(exc);
2453 return NULL;
2454}
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456/* --- Latin-1 Codec ------------------------------------------------------ */
2457
2458PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 const char *errors)
2461{
2462 PyUnicodeObject *v;
2463 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002466 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002467 Py_UNICODE r = *(unsigned char*)s;
2468 return PyUnicode_FromUnicode(&r, 1);
2469 }
2470
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 v = _PyUnicode_New(size);
2472 if (v == NULL)
2473 goto onError;
2474 if (size == 0)
2475 return (PyObject *)v;
2476 p = PyUnicode_AS_UNICODE(v);
2477 while (size-- > 0)
2478 *p++ = (unsigned char)*s++;
2479 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 onError:
2482 Py_XDECREF(v);
2483 return NULL;
2484}
2485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486/* create or adjust a UnicodeEncodeError */
2487static void make_encode_exception(PyObject **exceptionObject,
2488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002489 const Py_UNICODE *unicode, Py_ssize_t size,
2490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 if (*exceptionObject == NULL) {
2494 *exceptionObject = PyUnicodeEncodeError_Create(
2495 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2499 goto onError;
2500 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2501 goto onError;
2502 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2503 goto onError;
2504 return;
2505 onError:
2506 Py_DECREF(*exceptionObject);
2507 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 }
2509}
2510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511/* raises a UnicodeEncodeError */
2512static void raise_encode_exception(PyObject **exceptionObject,
2513 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 const Py_UNICODE *unicode, Py_ssize_t size,
2515 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 const char *reason)
2517{
2518 make_encode_exception(exceptionObject,
2519 encoding, unicode, size, startpos, endpos, reason);
2520 if (*exceptionObject != NULL)
2521 PyCodec_StrictErrors(*exceptionObject);
2522}
2523
2524/* error handling callback helper:
2525 build arguments, call the callback and check the arguments,
2526 put the result into newpos and return the replacement string, which
2527 has to be freed by the caller */
2528static PyObject *unicode_encode_call_errorhandler(const char *errors,
2529 PyObject **errorHandler,
2530 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2532 Py_ssize_t startpos, Py_ssize_t endpos,
2533 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536
2537 PyObject *restuple;
2538 PyObject *resunicode;
2539
2540 if (*errorHandler == NULL) {
2541 *errorHandler = PyCodec_LookupError(errors);
2542 if (*errorHandler == NULL)
2543 return NULL;
2544 }
2545
2546 make_encode_exception(exceptionObject,
2547 encoding, unicode, size, startpos, endpos, reason);
2548 if (*exceptionObject == NULL)
2549 return NULL;
2550
2551 restuple = PyObject_CallFunctionObjArgs(
2552 *errorHandler, *exceptionObject, NULL);
2553 if (restuple == NULL)
2554 return NULL;
2555 if (!PyTuple_Check(restuple)) {
2556 PyErr_Format(PyExc_TypeError, &argparse[4]);
2557 Py_DECREF(restuple);
2558 return NULL;
2559 }
2560 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2561 &resunicode, newpos)) {
2562 Py_DECREF(restuple);
2563 return NULL;
2564 }
2565 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002566 *newpos = size+*newpos;
2567 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002568 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002569 Py_DECREF(restuple);
2570 return NULL;
2571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 Py_INCREF(resunicode);
2573 Py_DECREF(restuple);
2574 return resunicode;
2575}
2576
2577static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 const char *errors,
2580 int limit)
2581{
2582 /* output object */
2583 PyObject *res;
2584 /* pointers to the beginning and end+1 of input */
2585 const Py_UNICODE *startp = p;
2586 const Py_UNICODE *endp = p + size;
2587 /* pointer to the beginning of the unencodable characters */
2588 /* const Py_UNICODE *badp = NULL; */
2589 /* pointer into the output */
2590 char *str;
2591 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t respos = 0;
2593 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002594 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2595 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 PyObject *errorHandler = NULL;
2597 PyObject *exc = NULL;
2598 /* the following variable is used for caching string comparisons
2599 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2600 int known_errorHandler = -1;
2601
2602 /* allocate enough for a simple encoding without
2603 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002604 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 if (res == NULL)
2606 goto onError;
2607 if (size == 0)
2608 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002609 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 ressize = size;
2611
2612 while (p<endp) {
2613 Py_UNICODE c = *p;
2614
2615 /* can we encode this? */
2616 if (c<limit) {
2617 /* no overflow check, because we know that the space is enough */
2618 *str++ = (char)c;
2619 ++p;
2620 }
2621 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t unicodepos = p-startp;
2623 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002625 Py_ssize_t repsize;
2626 Py_ssize_t newpos;
2627 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_UNICODE *uni2;
2629 /* startpos for collecting unencodable chars */
2630 const Py_UNICODE *collstart = p;
2631 const Py_UNICODE *collend = p;
2632 /* find all unecodable characters */
2633 while ((collend < endp) && ((*collend)>=limit))
2634 ++collend;
2635 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2636 if (known_errorHandler==-1) {
2637 if ((errors==NULL) || (!strcmp(errors, "strict")))
2638 known_errorHandler = 1;
2639 else if (!strcmp(errors, "replace"))
2640 known_errorHandler = 2;
2641 else if (!strcmp(errors, "ignore"))
2642 known_errorHandler = 3;
2643 else if (!strcmp(errors, "xmlcharrefreplace"))
2644 known_errorHandler = 4;
2645 else
2646 known_errorHandler = 0;
2647 }
2648 switch (known_errorHandler) {
2649 case 1: /* strict */
2650 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2651 goto onError;
2652 case 2: /* replace */
2653 while (collstart++<collend)
2654 *str++ = '?'; /* fall through */
2655 case 3: /* ignore */
2656 p = collend;
2657 break;
2658 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002659 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 /* determine replacement size (temporarily (mis)uses p) */
2661 for (p = collstart, repsize = 0; p < collend; ++p) {
2662 if (*p<10)
2663 repsize += 2+1+1;
2664 else if (*p<100)
2665 repsize += 2+2+1;
2666 else if (*p<1000)
2667 repsize += 2+3+1;
2668 else if (*p<10000)
2669 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002670#ifndef Py_UNICODE_WIDE
2671 else
2672 repsize += 2+5+1;
2673#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 else if (*p<100000)
2675 repsize += 2+5+1;
2676 else if (*p<1000000)
2677 repsize += 2+6+1;
2678 else
2679 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002680#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 }
2682 requiredsize = respos+repsize+(endp-collend);
2683 if (requiredsize > ressize) {
2684 if (requiredsize<2*ressize)
2685 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002686 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002688 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 ressize = requiredsize;
2690 }
2691 /* generate replacement (temporarily (mis)uses p) */
2692 for (p = collstart; p < collend; ++p) {
2693 str += sprintf(str, "&#%d;", (int)*p);
2694 }
2695 p = collend;
2696 break;
2697 default:
2698 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2699 encoding, reason, startp, size, &exc,
2700 collstart-startp, collend-startp, &newpos);
2701 if (repunicode == NULL)
2702 goto onError;
2703 /* need more space? (at least enough for what we
2704 have+the replacement+the rest of the string, so
2705 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002706 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 repsize = PyUnicode_GET_SIZE(repunicode);
2708 requiredsize = respos+repsize+(endp-collend);
2709 if (requiredsize > ressize) {
2710 if (requiredsize<2*ressize)
2711 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002712 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_DECREF(repunicode);
2714 goto onError;
2715 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002716 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 ressize = requiredsize;
2718 }
2719 /* check if there is anything unencodable in the replacement
2720 and copy it to the output */
2721 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2722 c = *uni2;
2723 if (c >= limit) {
2724 raise_encode_exception(&exc, encoding, startp, size,
2725 unicodepos, unicodepos+1, reason);
2726 Py_DECREF(repunicode);
2727 goto onError;
2728 }
2729 *str = (char)c;
2730 }
2731 p = startp + newpos;
2732 Py_DECREF(repunicode);
2733 }
2734 }
2735 }
2736 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002737 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 if (respos<ressize)
2739 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002740 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 Py_XDECREF(errorHandler);
2742 Py_XDECREF(exc);
2743 return res;
2744
2745 onError:
2746 Py_XDECREF(res);
2747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
2749 return NULL;
2750}
2751
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002753 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 const char *errors)
2755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
2759PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2760{
2761 if (!PyUnicode_Check(unicode)) {
2762 PyErr_BadArgument();
2763 return NULL;
2764 }
2765 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2766 PyUnicode_GET_SIZE(unicode),
2767 NULL);
2768}
2769
2770/* --- 7-bit ASCII Codec -------------------------------------------------- */
2771
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002773 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 const char *errors)
2775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 PyUnicodeObject *v;
2778 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002779 Py_ssize_t startinpos;
2780 Py_ssize_t endinpos;
2781 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 const char *e;
2783 PyObject *errorHandler = NULL;
2784 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002787 if (size == 1 && *(unsigned char*)s < 128) {
2788 Py_UNICODE r = *(unsigned char*)s;
2789 return PyUnicode_FromUnicode(&r, 1);
2790 }
Tim Petersced69f82003-09-16 20:30:58 +00002791
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 v = _PyUnicode_New(size);
2793 if (v == NULL)
2794 goto onError;
2795 if (size == 0)
2796 return (PyObject *)v;
2797 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 e = s + size;
2799 while (s < e) {
2800 register unsigned char c = (unsigned char)*s;
2801 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 ++s;
2804 }
2805 else {
2806 startinpos = s-starts;
2807 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002808 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "ascii", "ordinal not in range(128)",
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002817 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002818 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002819 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 onError:
2825 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 Py_XDECREF(errorHandler);
2827 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 return NULL;
2829}
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 const char *errors)
2834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
2838PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2839{
2840 if (!PyUnicode_Check(unicode)) {
2841 PyErr_BadArgument();
2842 return NULL;
2843 }
2844 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2845 PyUnicode_GET_SIZE(unicode),
2846 NULL);
2847}
2848
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002849#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002851/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002853#if SIZEOF_INT < SIZEOF_SSIZE_T
2854#define NEED_RETRY
2855#endif
2856
2857/* XXX This code is limited to "true" double-byte encodings, as
2858 a) it assumes an incomplete character consists of a single byte, and
2859 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2860 encodings, see IsDBCSLeadByteEx documentation. */
2861
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the character CharPrev()
   finds does not start with a lead byte, or when that previous character
   is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
2872
2873/*
2874 * Decode MBCS string into unicode object. If 'final' is set, converts
2875 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2876 */
2877static int decode_mbcs(PyUnicodeObject **v,
2878 const char *s, /* MBCS string */
2879 int size, /* sizeof MBCS string */
2880 int final)
2881{
2882 Py_UNICODE *p;
2883 Py_ssize_t n = 0;
2884 int usize = 0;
2885
2886 assert(size >= 0);
2887
2888 /* Skip trailing lead-byte unless 'final' is set */
2889 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2890 --size;
2891
2892 /* First get the size of the result */
2893 if (size > 0) {
2894 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2895 if (usize == 0) {
2896 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2897 return -1;
2898 }
2899 }
2900
2901 if (*v == NULL) {
2902 /* Create unicode object */
2903 *v = _PyUnicode_New(usize);
2904 if (*v == NULL)
2905 return -1;
2906 }
2907 else {
2908 /* Extend unicode object */
2909 n = PyUnicode_GET_SIZE(*v);
2910 if (_PyUnicode_Resize(v, n + usize) < 0)
2911 return -1;
2912 }
2913
2914 /* Do the conversion */
2915 if (size > 0) {
2916 p = PyUnicode_AS_UNICODE(*v) + n;
2917 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2918 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2919 return -1;
2920 }
2921 }
2922
2923 return size;
2924}
2925
2926PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2927 Py_ssize_t size,
2928 const char *errors,
2929 Py_ssize_t *consumed)
2930{
2931 PyUnicodeObject *v = NULL;
2932 int done;
2933
2934 if (consumed)
2935 *consumed = 0;
2936
2937#ifdef NEED_RETRY
2938 retry:
2939 if (size > INT_MAX)
2940 done = decode_mbcs(&v, s, INT_MAX, 0);
2941 else
2942#endif
2943 done = decode_mbcs(&v, s, (int)size, !consumed);
2944
2945 if (done < 0) {
2946 Py_XDECREF(v);
2947 return NULL;
2948 }
2949
2950 if (consumed)
2951 *consumed += done;
2952
2953#ifdef NEED_RETRY
2954 if (size > INT_MAX) {
2955 s += done;
2956 size -= done;
2957 goto retry;
2958 }
2959#endif
2960
2961 return (PyObject *)v;
2962}
2963
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002964PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002965 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002966 const char *errors)
2967{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002968 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2969}
2970
2971/*
2972 * Convert unicode into string object (MBCS).
2973 * Returns 0 if succeed, -1 otherwise.
2974 */
2975static int encode_mbcs(PyObject **repr,
2976 const Py_UNICODE *p, /* unicode */
2977 int size) /* size of unicode */
2978{
2979 int mbcssize = 0;
2980 Py_ssize_t n = 0;
2981
2982 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002983
2984 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002985 if (size > 0) {
2986 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2987 if (mbcssize == 0) {
2988 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2989 return -1;
2990 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002991 }
2992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002993 if (*repr == NULL) {
2994 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002995 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002996 if (*repr == NULL)
2997 return -1;
2998 }
2999 else {
3000 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003001 n = PyBytes_Size(*repr);
3002 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003003 return -1;
3004 }
3005
3006 /* Do the conversion */
3007 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003008 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003009 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3010 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3011 return -1;
3012 }
3013 }
3014
3015 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003016}
3017
3018PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 const char *errors)
3021{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003022 PyObject *repr = NULL;
3023 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003024
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003025#ifdef NEED_RETRY
3026 retry:
3027 if (size > INT_MAX)
3028 ret = encode_mbcs(&repr, p, INT_MAX);
3029 else
3030#endif
3031 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003032
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003033 if (ret < 0) {
3034 Py_XDECREF(repr);
3035 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003036 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037
3038#ifdef NEED_RETRY
3039 if (size > INT_MAX) {
3040 p += INT_MAX;
3041 size -= INT_MAX;
3042 goto retry;
3043 }
3044#endif
3045
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046 return repr;
3047}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003048
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003049PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3050{
3051 if (!PyUnicode_Check(unicode)) {
3052 PyErr_BadArgument();
3053 return NULL;
3054 }
3055 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3056 PyUnicode_GET_SIZE(unicode),
3057 NULL);
3058}
3059
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003060#undef NEED_RETRY
3061
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003062#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064/* --- Character Mapping Codec -------------------------------------------- */
3065
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003067 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 PyObject *mapping,
3069 const char *errors)
3070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003072 Py_ssize_t startinpos;
3073 Py_ssize_t endinpos;
3074 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 PyUnicodeObject *v;
3077 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003078 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 PyObject *errorHandler = NULL;
3080 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003081 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003082 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003083
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 /* Default to Latin-1 */
3085 if (mapping == NULL)
3086 return PyUnicode_DecodeLatin1(s, size, errors);
3087
3088 v = _PyUnicode_New(size);
3089 if (v == NULL)
3090 goto onError;
3091 if (size == 0)
3092 return (PyObject *)v;
3093 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003095 if (PyUnicode_CheckExact(mapping)) {
3096 mapstring = PyUnicode_AS_UNICODE(mapping);
3097 maplen = PyUnicode_GET_SIZE(mapping);
3098 while (s < e) {
3099 unsigned char ch = *s;
3100 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003102 if (ch < maplen)
3103 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 if (x == 0xfffe) {
3106 /* undefined mapping */
3107 outpos = p-PyUnicode_AS_UNICODE(v);
3108 startinpos = s-starts;
3109 endinpos = startinpos+1;
3110 if (unicode_decode_call_errorhandler(
3111 errors, &errorHandler,
3112 "charmap", "character maps to <undefined>",
3113 starts, size, &startinpos, &endinpos, &exc, &s,
3114 (PyObject **)&v, &outpos, &p)) {
3115 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003116 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003117 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003118 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003119 *p++ = x;
3120 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003122 }
3123 else {
3124 while (s < e) {
3125 unsigned char ch = *s;
3126 PyObject *w, *x;
3127
3128 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3129 w = PyInt_FromLong((long)ch);
3130 if (w == NULL)
3131 goto onError;
3132 x = PyObject_GetItem(mapping, w);
3133 Py_DECREF(w);
3134 if (x == NULL) {
3135 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3136 /* No mapping found means: mapping is undefined. */
3137 PyErr_Clear();
3138 x = Py_None;
3139 Py_INCREF(x);
3140 } else
3141 goto onError;
3142 }
3143
3144 /* Apply mapping */
3145 if (PyInt_Check(x)) {
3146 long value = PyInt_AS_LONG(x);
3147 if (value < 0 || value > 65535) {
3148 PyErr_SetString(PyExc_TypeError,
3149 "character mapping must be in range(65536)");
3150 Py_DECREF(x);
3151 goto onError;
3152 }
3153 *p++ = (Py_UNICODE)value;
3154 }
3155 else if (x == Py_None) {
3156 /* undefined mapping */
3157 outpos = p-PyUnicode_AS_UNICODE(v);
3158 startinpos = s-starts;
3159 endinpos = startinpos+1;
3160 if (unicode_decode_call_errorhandler(
3161 errors, &errorHandler,
3162 "charmap", "character maps to <undefined>",
3163 starts, size, &startinpos, &endinpos, &exc, &s,
3164 (PyObject **)&v, &outpos, &p)) {
3165 Py_DECREF(x);
3166 goto onError;
3167 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003168 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003169 continue;
3170 }
3171 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003172 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173
3174 if (targetsize == 1)
3175 /* 1-1 mapping */
3176 *p++ = *PyUnicode_AS_UNICODE(x);
3177
3178 else if (targetsize > 1) {
3179 /* 1-n mapping */
3180 if (targetsize > extrachars) {
3181 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003182 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3183 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003184 (targetsize << 2);
3185 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003186 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003187 if (_PyUnicode_Resize(&v,
3188 PyUnicode_GET_SIZE(v) + needed) < 0) {
3189 Py_DECREF(x);
3190 goto onError;
3191 }
3192 p = PyUnicode_AS_UNICODE(v) + oldpos;
3193 }
3194 Py_UNICODE_COPY(p,
3195 PyUnicode_AS_UNICODE(x),
3196 targetsize);
3197 p += targetsize;
3198 extrachars -= targetsize;
3199 }
3200 /* 1-0 mapping: skip the character */
3201 }
3202 else {
3203 /* wrong return value */
3204 PyErr_SetString(PyExc_TypeError,
3205 "character mapping must return integer, None or unicode");
3206 Py_DECREF(x);
3207 goto onError;
3208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003210 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 }
3213 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 Py_XDECREF(errorHandler);
3222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_XDECREF(v);
3224 return NULL;
3225}
3226
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003227/* Charmap encoding: the lookup table */
3228
3229struct encoding_map{
3230 PyObject_HEAD
3231 unsigned char level1[32];
3232 int count2, count3;
3233 unsigned char level23[1];
3234};
3235
3236static PyObject*
3237encoding_map_size(PyObject *obj, PyObject* args)
3238{
3239 struct encoding_map *map = (struct encoding_map*)obj;
3240 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3241 128*map->count3);
3242}
3243
3244static PyMethodDef encoding_map_methods[] = {
3245 {"size", encoding_map_size, METH_NOARGS,
3246 PyDoc_STR("Return the size (in bytes) of this object") },
3247 { 0 }
3248};
3249
3250static void
3251encoding_map_dealloc(PyObject* o)
3252{
3253 PyObject_FREE(o);
3254}
3255
3256static PyTypeObject EncodingMapType = {
3257 PyObject_HEAD_INIT(NULL)
3258 0, /*ob_size*/
3259 "EncodingMap", /*tp_name*/
3260 sizeof(struct encoding_map), /*tp_basicsize*/
3261 0, /*tp_itemsize*/
3262 /* methods */
3263 encoding_map_dealloc, /*tp_dealloc*/
3264 0, /*tp_print*/
3265 0, /*tp_getattr*/
3266 0, /*tp_setattr*/
3267 0, /*tp_compare*/
3268 0, /*tp_repr*/
3269 0, /*tp_as_number*/
3270 0, /*tp_as_sequence*/
3271 0, /*tp_as_mapping*/
3272 0, /*tp_hash*/
3273 0, /*tp_call*/
3274 0, /*tp_str*/
3275 0, /*tp_getattro*/
3276 0, /*tp_setattro*/
3277 0, /*tp_as_buffer*/
3278 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3279 0, /*tp_doc*/
3280 0, /*tp_traverse*/
3281 0, /*tp_clear*/
3282 0, /*tp_richcompare*/
3283 0, /*tp_weaklistoffset*/
3284 0, /*tp_iter*/
3285 0, /*tp_iternext*/
3286 encoding_map_methods, /*tp_methods*/
3287 0, /*tp_members*/
3288 0, /*tp_getset*/
3289 0, /*tp_base*/
3290 0, /*tp_dict*/
3291 0, /*tp_descr_get*/
3292 0, /*tp_descr_set*/
3293 0, /*tp_dictoffset*/
3294 0, /*tp_init*/
3295 0, /*tp_alloc*/
3296 0, /*tp_new*/
3297 0, /*tp_free*/
3298 0, /*tp_is_gc*/
3299};
3300
3301PyObject*
3302PyUnicode_BuildEncodingMap(PyObject* string)
3303{
3304 Py_UNICODE *decode;
3305 PyObject *result;
3306 struct encoding_map *mresult;
3307 int i;
3308 int need_dict = 0;
3309 unsigned char level1[32];
3310 unsigned char level2[512];
3311 unsigned char *mlevel1, *mlevel2, *mlevel3;
3312 int count2 = 0, count3 = 0;
3313
3314 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3315 PyErr_BadArgument();
3316 return NULL;
3317 }
3318 decode = PyUnicode_AS_UNICODE(string);
3319 memset(level1, 0xFF, sizeof level1);
3320 memset(level2, 0xFF, sizeof level2);
3321
3322 /* If there isn't a one-to-one mapping of NULL to \0,
3323 or if there are non-BMP characters, we need to use
3324 a mapping dictionary. */
3325 if (decode[0] != 0)
3326 need_dict = 1;
3327 for (i = 1; i < 256; i++) {
3328 int l1, l2;
3329 if (decode[i] == 0
3330 #ifdef Py_UNICODE_WIDE
3331 || decode[i] > 0xFFFF
3332 #endif
3333 ) {
3334 need_dict = 1;
3335 break;
3336 }
3337 if (decode[i] == 0xFFFE)
3338 /* unmapped character */
3339 continue;
3340 l1 = decode[i] >> 11;
3341 l2 = decode[i] >> 7;
3342 if (level1[l1] == 0xFF)
3343 level1[l1] = count2++;
3344 if (level2[l2] == 0xFF)
3345 level2[l2] = count3++;
3346 }
3347
3348 if (count2 >= 0xFF || count3 >= 0xFF)
3349 need_dict = 1;
3350
3351 if (need_dict) {
3352 PyObject *result = PyDict_New();
3353 PyObject *key, *value;
3354 if (!result)
3355 return NULL;
3356 for (i = 0; i < 256; i++) {
3357 key = value = NULL;
3358 key = PyInt_FromLong(decode[i]);
3359 value = PyInt_FromLong(i);
3360 if (!key || !value)
3361 goto failed1;
3362 if (PyDict_SetItem(result, key, value) == -1)
3363 goto failed1;
3364 Py_DECREF(key);
3365 Py_DECREF(value);
3366 }
3367 return result;
3368 failed1:
3369 Py_XDECREF(key);
3370 Py_XDECREF(value);
3371 Py_DECREF(result);
3372 return NULL;
3373 }
3374
3375 /* Create a three-level trie */
3376 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3377 16*count2 + 128*count3 - 1);
3378 if (!result)
3379 return PyErr_NoMemory();
3380 PyObject_Init(result, &EncodingMapType);
3381 mresult = (struct encoding_map*)result;
3382 mresult->count2 = count2;
3383 mresult->count3 = count3;
3384 mlevel1 = mresult->level1;
3385 mlevel2 = mresult->level23;
3386 mlevel3 = mresult->level23 + 16*count2;
3387 memcpy(mlevel1, level1, 32);
3388 memset(mlevel2, 0xFF, 16*count2);
3389 memset(mlevel3, 0, 128*count3);
3390 count3 = 0;
3391 for (i = 1; i < 256; i++) {
3392 int o1, o2, o3, i2, i3;
3393 if (decode[i] == 0xFFFE)
3394 /* unmapped character */
3395 continue;
3396 o1 = decode[i]>>11;
3397 o2 = (decode[i]>>7) & 0xF;
3398 i2 = 16*mlevel1[o1] + o2;
3399 if (mlevel2[i2] == 0xFF)
3400 mlevel2[i2] = count3++;
3401 o3 = decode[i] & 0x7F;
3402 i3 = 128*mlevel2[i2] + o3;
3403 mlevel3[i3] = i;
3404 }
3405 return result;
3406}
3407
3408static int
3409encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3410{
3411 struct encoding_map *map = (struct encoding_map*)mapping;
3412 int l1 = c>>11;
3413 int l2 = (c>>7) & 0xF;
3414 int l3 = c & 0x7F;
3415 int i;
3416
3417#ifdef Py_UNICODE_WIDE
3418 if (c > 0xFFFF) {
3419 return -1;
3420 }
3421#endif
3422 if (c == 0)
3423 return 0;
3424 /* level 1*/
3425 i = map->level1[l1];
3426 if (i == 0xFF) {
3427 return -1;
3428 }
3429 /* level 2*/
3430 i = map->level23[16*i+l2];
3431 if (i == 0xFF) {
3432 return -1;
3433 }
3434 /* level 3 */
3435 i = map->level23[16*map->count2 + 128*i + l3];
3436 if (i == 0) {
3437 return -1;
3438 }
3439 return i;
3440}
3441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442/* Lookup the character ch in the mapping. If the character
3443 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003444 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 PyObject *w = PyInt_FromLong((long)c);
3448 PyObject *x;
3449
3450 if (w == NULL)
3451 return NULL;
3452 x = PyObject_GetItem(mapping, w);
3453 Py_DECREF(w);
3454 if (x == NULL) {
3455 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3456 /* No mapping found means: mapping is undefined. */
3457 PyErr_Clear();
3458 x = Py_None;
3459 Py_INCREF(x);
3460 return x;
3461 } else
3462 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003464 else if (x == Py_None)
3465 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 if (value < 0 || value > 255) {
3469 PyErr_SetString(PyExc_TypeError,
3470 "character mapping must be in range(256)");
3471 Py_DECREF(x);
3472 return NULL;
3473 }
3474 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 else if (PyString_Check(x))
3477 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 /* wrong return value */
3480 PyErr_SetString(PyExc_TypeError,
3481 "character mapping must return integer, None or str");
3482 Py_DECREF(x);
3483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 }
3485}
3486
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003487static int
3488charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3489{
3490 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3491 /* exponentially overallocate to minimize reallocations */
3492 if (requiredsize < 2*outsize)
3493 requiredsize = 2*outsize;
3494 if (_PyString_Resize(outobj, requiredsize)) {
3495 return 0;
3496 }
3497 return 1;
3498}
3499
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* lookup or reallocation failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503/* lookup the character, put the result in the output string and adjust
3504 various state variables. Reallocate the output string if not enough
3505 space is available. Return a new reference to the object that
3506 was put in the output buffer, or Py_None, if the mapping was undefined
3507 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003508 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003510charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003513 PyObject *rep;
3514 char *outstart;
3515 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003517 if (mapping->ob_type == &EncodingMapType) {
3518 int res = encoding_map_lookup(c, mapping);
3519 Py_ssize_t requiredsize = *outpos+1;
3520 if (res == -1)
3521 return enc_FAILED;
3522 if (outsize<requiredsize)
3523 if (!charmapencode_resize(outobj, outpos, requiredsize))
3524 return enc_EXCEPTION;
3525 outstart = PyString_AS_STRING(*outobj);
3526 outstart[(*outpos)++] = (char)res;
3527 return enc_SUCCESS;
3528 }
3529
3530 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532 return enc_EXCEPTION;
3533 else if (rep==Py_None) {
3534 Py_DECREF(rep);
3535 return enc_FAILED;
3536 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003538 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003542 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003544 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3546 }
3547 else {
3548 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003549 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3550 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003551 if (outsize<requiredsize)
3552 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003554 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003556 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 memcpy(outstart + *outpos, repchars, repsize);
3558 *outpos += repsize;
3559 }
3560 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003561 Py_DECREF(rep);
3562 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563}
3564
3565/* handle an error in PyUnicode_EncodeCharmap
3566 Return 0 on success, -1 on error */
3567static
3568int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003571 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003572 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573{
3574 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t repsize;
3576 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 Py_UNICODE *uni2;
3578 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t collstartpos = *inpos;
3580 Py_ssize_t collendpos = *inpos+1;
3581 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 char *encoding = "charmap";
3583 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 /* find all unencodable characters */
3587 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 PyObject *rep;
3589 if (mapping->ob_type == &EncodingMapType) {
3590 int res = encoding_map_lookup(p[collendpos], mapping);
3591 if (res != -1)
3592 break;
3593 ++collendpos;
3594 continue;
3595 }
3596
3597 rep = charmapencode_lookup(p[collendpos], mapping);
3598 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 else if (rep!=Py_None) {
3601 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 break;
3603 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003604 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 ++collendpos;
3606 }
3607 /* cache callback name lookup
3608 * (if not done yet, i.e. it's the first error) */
3609 if (*known_errorHandler==-1) {
3610 if ((errors==NULL) || (!strcmp(errors, "strict")))
3611 *known_errorHandler = 1;
3612 else if (!strcmp(errors, "replace"))
3613 *known_errorHandler = 2;
3614 else if (!strcmp(errors, "ignore"))
3615 *known_errorHandler = 3;
3616 else if (!strcmp(errors, "xmlcharrefreplace"))
3617 *known_errorHandler = 4;
3618 else
3619 *known_errorHandler = 0;
3620 }
3621 switch (*known_errorHandler) {
3622 case 1: /* strict */
3623 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3624 return -1;
3625 case 2: /* replace */
3626 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3627 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003628 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 return -1;
3630 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003631 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3633 return -1;
3634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 }
3636 /* fall through */
3637 case 3: /* ignore */
3638 *inpos = collendpos;
3639 break;
3640 case 4: /* xmlcharrefreplace */
3641 /* generate replacement (temporarily (mis)uses p) */
3642 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3643 char buffer[2+29+1+1];
3644 char *cp;
3645 sprintf(buffer, "&#%d;", (int)p[collpos]);
3646 for (cp = buffer; *cp; ++cp) {
3647 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003648 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003650 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3652 return -1;
3653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 }
3655 }
3656 *inpos = collendpos;
3657 break;
3658 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003659 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 encoding, reason, p, size, exceptionObject,
3661 collstartpos, collendpos, &newpos);
3662 if (repunicode == NULL)
3663 return -1;
3664 /* generate replacement */
3665 repsize = PyUnicode_GET_SIZE(repunicode);
3666 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3667 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003668 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 return -1;
3670 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003671 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3674 return -1;
3675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677 *inpos = newpos;
3678 Py_DECREF(repunicode);
3679 }
3680 return 0;
3681}
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 PyObject *mapping,
3686 const char *errors)
3687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 /* output object */
3689 PyObject *res = NULL;
3690 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003691 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003693 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 PyObject *errorHandler = NULL;
3695 PyObject *exc = NULL;
3696 /* the following variable is used for caching string comparisons
3697 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3698 * 3=ignore, 4=xmlcharrefreplace */
3699 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
3701 /* Default to Latin-1 */
3702 if (mapping == NULL)
3703 return PyUnicode_EncodeLatin1(p, size, errors);
3704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 /* allocate enough for a simple encoding without
3706 replacements, if we need more, we'll resize */
3707 res = PyString_FromStringAndSize(NULL, size);
3708 if (res == NULL)
3709 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003710 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 while (inpos<size) {
3714 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3716 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003718 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 if (charmap_encoding_error(p, size, &inpos, mapping,
3720 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003721 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003722 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003723 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 else
3727 /* done with this character => adjust input position */
3728 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 /* Resize if we allocated to much */
3732 if (respos<PyString_GET_SIZE(res)) {
3733 if (_PyString_Resize(&res, respos))
3734 goto onError;
3735 }
3736 Py_XDECREF(exc);
3737 Py_XDECREF(errorHandler);
3738 return res;
3739
3740 onError:
3741 Py_XDECREF(res);
3742 Py_XDECREF(exc);
3743 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 return NULL;
3745}
3746
3747PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3748 PyObject *mapping)
3749{
3750 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3751 PyErr_BadArgument();
3752 return NULL;
3753 }
3754 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3755 PyUnicode_GET_SIZE(unicode),
3756 mapping,
3757 NULL);
3758}
3759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760/* create or adjust a UnicodeTranslateError */
3761static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003762 const Py_UNICODE *unicode, Py_ssize_t size,
3763 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 if (*exceptionObject == NULL) {
3767 *exceptionObject = PyUnicodeTranslateError_Create(
3768 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
3770 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3772 goto onError;
3773 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3774 goto onError;
3775 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3776 goto onError;
3777 return;
3778 onError:
3779 Py_DECREF(*exceptionObject);
3780 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
3782}
3783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784/* raises a UnicodeTranslateError */
3785static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 const Py_UNICODE *unicode, Py_ssize_t size,
3787 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 const char *reason)
3789{
3790 make_translate_exception(exceptionObject,
3791 unicode, size, startpos, endpos, reason);
3792 if (*exceptionObject != NULL)
3793 PyCodec_StrictErrors(*exceptionObject);
3794}
3795
3796/* error handling callback helper:
3797 build arguments, call the callback and check the arguments,
3798 put the result into newpos and return the replacement string, which
3799 has to be freed by the caller */
3800static PyObject *unicode_translate_call_errorhandler(const char *errors,
3801 PyObject **errorHandler,
3802 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003803 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3804 Py_ssize_t startpos, Py_ssize_t endpos,
3805 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003807 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003809 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 PyObject *restuple;
3811 PyObject *resunicode;
3812
3813 if (*errorHandler == NULL) {
3814 *errorHandler = PyCodec_LookupError(errors);
3815 if (*errorHandler == NULL)
3816 return NULL;
3817 }
3818
3819 make_translate_exception(exceptionObject,
3820 unicode, size, startpos, endpos, reason);
3821 if (*exceptionObject == NULL)
3822 return NULL;
3823
3824 restuple = PyObject_CallFunctionObjArgs(
3825 *errorHandler, *exceptionObject, NULL);
3826 if (restuple == NULL)
3827 return NULL;
3828 if (!PyTuple_Check(restuple)) {
3829 PyErr_Format(PyExc_TypeError, &argparse[4]);
3830 Py_DECREF(restuple);
3831 return NULL;
3832 }
3833 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_DECREF(restuple);
3836 return NULL;
3837 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003838 if (i_newpos<0)
3839 *newpos = size+i_newpos;
3840 else
3841 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003842 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003843 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003844 Py_DECREF(restuple);
3845 return NULL;
3846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 Py_INCREF(resunicode);
3848 Py_DECREF(restuple);
3849 return resunicode;
3850}
3851
3852/* Lookup the character ch in the mapping and put the result in result,
3853 which must be decrefed by the caller.
3854 Return 0 on success, -1 on error */
3855static
3856int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3857{
3858 PyObject *w = PyInt_FromLong((long)c);
3859 PyObject *x;
3860
3861 if (w == NULL)
3862 return -1;
3863 x = PyObject_GetItem(mapping, w);
3864 Py_DECREF(w);
3865 if (x == NULL) {
3866 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3867 /* No mapping found means: use 1:1 mapping. */
3868 PyErr_Clear();
3869 *result = NULL;
3870 return 0;
3871 } else
3872 return -1;
3873 }
3874 else if (x == Py_None) {
3875 *result = x;
3876 return 0;
3877 }
3878 else if (PyInt_Check(x)) {
3879 long value = PyInt_AS_LONG(x);
3880 long max = PyUnicode_GetMax();
3881 if (value < 0 || value > max) {
3882 PyErr_Format(PyExc_TypeError,
3883 "character mapping must be in range(0x%lx)", max+1);
3884 Py_DECREF(x);
3885 return -1;
3886 }
3887 *result = x;
3888 return 0;
3889 }
3890 else if (PyUnicode_Check(x)) {
3891 *result = x;
3892 return 0;
3893 }
3894 else {
3895 /* wrong return value */
3896 PyErr_SetString(PyExc_TypeError,
3897 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003898 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return -1;
3900 }
3901}
3902/* ensure that *outobj is at least requiredsize characters long,
3903if not reallocate and adjust various state variables.
3904Return 0 on success, -1 on error */
3905static
Walter Dörwald4894c302003-10-24 14:25:28 +00003906int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003907 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003910 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003914 if (requiredsize < 2 * oldsize)
3915 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003916 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 return -1;
3918 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 }
3920 return 0;
3921}
3922/* lookup the character, put the result in the output string and adjust
3923 various state variables. Return a new reference to the object that
3924 was put in the output buffer in *result, or Py_None, if the mapping was
3925 undefined (in which case no character was written).
3926 The called must decref result.
3927 Return 0 on success, -1 on error. */
3928static
Walter Dörwald4894c302003-10-24 14:25:28 +00003929int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003931 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932{
Walter Dörwald4894c302003-10-24 14:25:28 +00003933 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 return -1;
3935 if (*res==NULL) {
3936 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003937 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 }
3939 else if (*res==Py_None)
3940 ;
3941 else if (PyInt_Check(*res)) {
3942 /* no overflow check, because we know that the space is enough */
3943 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3944 }
3945 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 if (repsize==1) {
3948 /* no overflow check, because we know that the space is enough */
3949 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3950 }
3951 else if (repsize!=0) {
3952 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003954 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003955 repsize - 1;
3956 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 return -1;
3958 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3959 *outp += repsize;
3960 }
3961 }
3962 else
3963 return -1;
3964 return 0;
3965}
3966
3967PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 PyObject *mapping,
3970 const char *errors)
3971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 /* output object */
3973 PyObject *res = NULL;
3974 /* pointers to the beginning and end+1 of input */
3975 const Py_UNICODE *startp = p;
3976 const Py_UNICODE *endp = p + size;
3977 /* pointer into the output */
3978 Py_UNICODE *str;
3979 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 char *reason = "character maps to <undefined>";
3982 PyObject *errorHandler = NULL;
3983 PyObject *exc = NULL;
3984 /* the following variable is used for caching string comparisons
3985 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3986 * 3=ignore, 4=xmlcharrefreplace */
3987 int known_errorHandler = -1;
3988
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (mapping == NULL) {
3990 PyErr_BadArgument();
3991 return NULL;
3992 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993
3994 /* allocate enough for a simple 1:1 translation without
3995 replacements, if we need more, we'll resize */
3996 res = PyUnicode_FromUnicode(NULL, size);
3997 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003998 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 return res;
4001 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 while (p<endp) {
4004 /* try to encode it */
4005 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004006 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 goto onError;
4009 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004010 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 if (x!=Py_None) /* it worked => adjust input pointer */
4012 ++p;
4013 else { /* untranslatable character */
4014 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 Py_ssize_t repsize;
4016 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 Py_UNICODE *uni2;
4018 /* startpos for collecting untranslatable chars */
4019 const Py_UNICODE *collstart = p;
4020 const Py_UNICODE *collend = p+1;
4021 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 /* find all untranslatable characters */
4024 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004025 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 goto onError;
4027 Py_XDECREF(x);
4028 if (x!=Py_None)
4029 break;
4030 ++collend;
4031 }
4032 /* cache callback name lookup
4033 * (if not done yet, i.e. it's the first error) */
4034 if (known_errorHandler==-1) {
4035 if ((errors==NULL) || (!strcmp(errors, "strict")))
4036 known_errorHandler = 1;
4037 else if (!strcmp(errors, "replace"))
4038 known_errorHandler = 2;
4039 else if (!strcmp(errors, "ignore"))
4040 known_errorHandler = 3;
4041 else if (!strcmp(errors, "xmlcharrefreplace"))
4042 known_errorHandler = 4;
4043 else
4044 known_errorHandler = 0;
4045 }
4046 switch (known_errorHandler) {
4047 case 1: /* strict */
4048 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4049 goto onError;
4050 case 2: /* replace */
4051 /* No need to check for space, this is a 1:1 replacement */
4052 for (coll = collstart; coll<collend; ++coll)
4053 *str++ = '?';
4054 /* fall through */
4055 case 3: /* ignore */
4056 p = collend;
4057 break;
4058 case 4: /* xmlcharrefreplace */
4059 /* generate replacement (temporarily (mis)uses p) */
4060 for (p = collstart; p < collend; ++p) {
4061 char buffer[2+29+1+1];
4062 char *cp;
4063 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004064 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4066 goto onError;
4067 for (cp = buffer; *cp; ++cp)
4068 *str++ = *cp;
4069 }
4070 p = collend;
4071 break;
4072 default:
4073 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4074 reason, startp, size, &exc,
4075 collstart-startp, collend-startp, &newpos);
4076 if (repunicode == NULL)
4077 goto onError;
4078 /* generate replacement */
4079 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004080 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4082 Py_DECREF(repunicode);
4083 goto onError;
4084 }
4085 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4086 *str++ = *uni2;
4087 p = startp + newpos;
4088 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 }
4090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 /* Resize if we allocated to much */
4093 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004094 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004095 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 }
4098 Py_XDECREF(exc);
4099 Py_XDECREF(errorHandler);
4100 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 onError:
4103 Py_XDECREF(res);
4104 Py_XDECREF(exc);
4105 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 return NULL;
4107}
4108
4109PyObject *PyUnicode_Translate(PyObject *str,
4110 PyObject *mapping,
4111 const char *errors)
4112{
4113 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004114
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 str = PyUnicode_FromObject(str);
4116 if (str == NULL)
4117 goto onError;
4118 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4119 PyUnicode_GET_SIZE(str),
4120 mapping,
4121 errors);
4122 Py_DECREF(str);
4123 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 onError:
4126 Py_XDECREF(str);
4127 return NULL;
4128}
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossum9e896b32000-04-05 20:11:21 +00004130/* --- Decimal Encoder ---------------------------------------------------- */
4131
4132int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004134 char *output,
4135 const char *errors)
4136{
4137 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 PyObject *errorHandler = NULL;
4139 PyObject *exc = NULL;
4140 const char *encoding = "decimal";
4141 const char *reason = "invalid decimal Unicode string";
4142 /* the following variable is used for caching string comparisons
4143 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4144 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004145
4146 if (output == NULL) {
4147 PyErr_BadArgument();
4148 return -1;
4149 }
4150
4151 p = s;
4152 end = s + length;
4153 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004155 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t repsize;
4158 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 Py_UNICODE *uni2;
4160 Py_UNICODE *collstart;
4161 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004162
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 if (Py_UNICODE_ISSPACE(ch)) {
4164 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004166 continue;
4167 }
4168 decimal = Py_UNICODE_TODECIMAL(ch);
4169 if (decimal >= 0) {
4170 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004172 continue;
4173 }
Guido van Rossumba477042000-04-06 18:18:10 +00004174 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004175 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004177 continue;
4178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* All other characters are considered unencodable */
4180 collstart = p;
4181 collend = p+1;
4182 while (collend < end) {
4183 if ((0 < *collend && *collend < 256) ||
4184 !Py_UNICODE_ISSPACE(*collend) ||
4185 Py_UNICODE_TODECIMAL(*collend))
4186 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 /* cache callback name lookup
4189 * (if not done yet, i.e. it's the first error) */
4190 if (known_errorHandler==-1) {
4191 if ((errors==NULL) || (!strcmp(errors, "strict")))
4192 known_errorHandler = 1;
4193 else if (!strcmp(errors, "replace"))
4194 known_errorHandler = 2;
4195 else if (!strcmp(errors, "ignore"))
4196 known_errorHandler = 3;
4197 else if (!strcmp(errors, "xmlcharrefreplace"))
4198 known_errorHandler = 4;
4199 else
4200 known_errorHandler = 0;
4201 }
4202 switch (known_errorHandler) {
4203 case 1: /* strict */
4204 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4205 goto onError;
4206 case 2: /* replace */
4207 for (p = collstart; p < collend; ++p)
4208 *output++ = '?';
4209 /* fall through */
4210 case 3: /* ignore */
4211 p = collend;
4212 break;
4213 case 4: /* xmlcharrefreplace */
4214 /* generate replacement (temporarily (mis)uses p) */
4215 for (p = collstart; p < collend; ++p)
4216 output += sprintf(output, "&#%d;", (int)*p);
4217 p = collend;
4218 break;
4219 default:
4220 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4221 encoding, reason, s, length, &exc,
4222 collstart-s, collend-s, &newpos);
4223 if (repunicode == NULL)
4224 goto onError;
4225 /* generate replacement */
4226 repsize = PyUnicode_GET_SIZE(repunicode);
4227 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4228 Py_UNICODE ch = *uni2;
4229 if (Py_UNICODE_ISSPACE(ch))
4230 *output++ = ' ';
4231 else {
4232 decimal = Py_UNICODE_TODECIMAL(ch);
4233 if (decimal >= 0)
4234 *output++ = '0' + decimal;
4235 else if (0 < ch && ch < 256)
4236 *output++ = (char)ch;
4237 else {
4238 Py_DECREF(repunicode);
4239 raise_encode_exception(&exc, encoding,
4240 s, length, collstart-s, collend-s, reason);
4241 goto onError;
4242 }
4243 }
4244 }
4245 p = s + newpos;
4246 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004247 }
4248 }
4249 /* 0-terminate the output string */
4250 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 Py_XDECREF(exc);
4252 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004253 return 0;
4254
4255 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 Py_XDECREF(exc);
4257 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004258 return -1;
4259}
4260
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261/* --- Helpers ------------------------------------------------------------ */
4262
Thomas Wouters477c8d52006-05-27 19:21:47 +00004263#define STRINGLIB_CHAR Py_UNICODE
4264
4265#define STRINGLIB_LEN PyUnicode_GET_SIZE
4266#define STRINGLIB_NEW PyUnicode_FromUnicode
4267#define STRINGLIB_STR PyUnicode_AS_UNICODE
4268
4269Py_LOCAL_INLINE(int)
4270STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004272 if (str[0] != other[0])
4273 return 1;
4274 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275}
4276
Thomas Wouters477c8d52006-05-27 19:21:47 +00004277#define STRINGLIB_EMPTY unicode_empty
4278
4279#include "stringlib/fastsearch.h"
4280
4281#include "stringlib/count.h"
4282#include "stringlib/find.h"
4283#include "stringlib/partition.h"
4284
/* Helper macro normalizing slice bounds against (obj)->length.

   It operates on the variables named `start` and `end` in the calling
   scope: a negative value is taken relative to the end of the object and
   then clamped to 0, and `end` is additionally capped at the length.
   `start` is not capped at the length.  The macro expands to several
   statements, so use it only as a statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4297
Martin v. Löwis18e16552006-02-15 17:27:45 +00004298Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004299 PyObject *substr,
4300 Py_ssize_t start,
4301 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004304 PyUnicodeObject* str_obj;
4305 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004306
Thomas Wouters477c8d52006-05-27 19:21:47 +00004307 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4308 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004310 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4311 if (!sub_obj) {
4312 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return -1;
4314 }
Tim Petersced69f82003-09-16 20:30:58 +00004315
Thomas Wouters477c8d52006-05-27 19:21:47 +00004316 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004317
Thomas Wouters477c8d52006-05-27 19:21:47 +00004318 result = stringlib_count(
4319 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4320 );
4321
4322 Py_DECREF(sub_obj);
4323 Py_DECREF(str_obj);
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return result;
4326}
4327
Martin v. Löwis18e16552006-02-15 17:27:45 +00004328Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004329 PyObject *sub,
4330 Py_ssize_t start,
4331 Py_ssize_t end,
4332 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004334 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004337 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004338 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004339 sub = PyUnicode_FromObject(sub);
4340 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004341 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004342 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 }
Tim Petersced69f82003-09-16 20:30:58 +00004344
Thomas Wouters477c8d52006-05-27 19:21:47 +00004345 if (direction > 0)
4346 result = stringlib_find_slice(
4347 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4348 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4349 start, end
4350 );
4351 else
4352 result = stringlib_rfind_slice(
4353 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4354 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4355 start, end
4356 );
4357
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004359 Py_DECREF(sub);
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 return result;
4362}
4363
Tim Petersced69f82003-09-16 20:30:58 +00004364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365int tailmatch(PyUnicodeObject *self,
4366 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t start,
4368 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 int direction)
4370{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 if (substring->length == 0)
4372 return 1;
4373
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
4376 end -= substring->length;
4377 if (end < start)
4378 return 0;
4379
4380 if (direction > 0) {
4381 if (Py_UNICODE_MATCH(self, end, substring))
4382 return 1;
4383 } else {
4384 if (Py_UNICODE_MATCH(self, start, substring))
4385 return 1;
4386 }
4387
4388 return 0;
4389}
4390
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t start,
4394 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 int direction)
4396{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 str = PyUnicode_FromObject(str);
4400 if (str == NULL)
4401 return -1;
4402 substr = PyUnicode_FromObject(substr);
4403 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004404 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return -1;
4406 }
Tim Petersced69f82003-09-16 20:30:58 +00004407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 result = tailmatch((PyUnicodeObject *)str,
4409 (PyUnicodeObject *)substr,
4410 start, end, direction);
4411 Py_DECREF(str);
4412 Py_DECREF(substr);
4413 return result;
4414}
4415
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416/* Apply fixfct filter to the Unicode object self and return a
4417 reference to the modified object */
4418
Tim Petersced69f82003-09-16 20:30:58 +00004419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420PyObject *fixup(PyUnicodeObject *self,
4421 int (*fixfct)(PyUnicodeObject *s))
4422{
4423
4424 PyUnicodeObject *u;
4425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004426 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 if (u == NULL)
4428 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004429
4430 Py_UNICODE_COPY(u->str, self->str, self->length);
4431
Tim Peters7a29bd52001-09-12 03:03:31 +00004432 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 /* fixfct should return TRUE if it modified the buffer. If
4434 FALSE, return a reference to the original buffer instead
4435 (to save space, not time) */
4436 Py_INCREF(self);
4437 Py_DECREF(u);
4438 return (PyObject*) self;
4439 }
4440 return (PyObject*) u;
4441}
4442
Tim Petersced69f82003-09-16 20:30:58 +00004443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444int fixupper(PyUnicodeObject *self)
4445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 Py_UNICODE *s = self->str;
4448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 while (len-- > 0) {
4451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 ch = Py_UNICODE_TOUPPER(*s);
4454 if (ch != *s) {
4455 status = 1;
4456 *s = ch;
4457 }
4458 s++;
4459 }
4460
4461 return status;
4462}
4463
Tim Petersced69f82003-09-16 20:30:58 +00004464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465int fixlower(PyUnicodeObject *self)
4466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 Py_UNICODE *s = self->str;
4469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 while (len-- > 0) {
4472 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 ch = Py_UNICODE_TOLOWER(*s);
4475 if (ch != *s) {
4476 status = 1;
4477 *s = ch;
4478 }
4479 s++;
4480 }
4481
4482 return status;
4483}
4484
Tim Petersced69f82003-09-16 20:30:58 +00004485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486int fixswapcase(PyUnicodeObject *self)
4487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 Py_UNICODE *s = self->str;
4490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004491
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 while (len-- > 0) {
4493 if (Py_UNICODE_ISUPPER(*s)) {
4494 *s = Py_UNICODE_TOLOWER(*s);
4495 status = 1;
4496 } else if (Py_UNICODE_ISLOWER(*s)) {
4497 *s = Py_UNICODE_TOUPPER(*s);
4498 status = 1;
4499 }
4500 s++;
4501 }
4502
4503 return status;
4504}
4505
Tim Petersced69f82003-09-16 20:30:58 +00004506static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507int fixcapitalize(PyUnicodeObject *self)
4508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004510 Py_UNICODE *s = self->str;
4511 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004512
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004513 if (len == 0)
4514 return 0;
4515 if (Py_UNICODE_ISLOWER(*s)) {
4516 *s = Py_UNICODE_TOUPPER(*s);
4517 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004519 s++;
4520 while (--len > 0) {
4521 if (Py_UNICODE_ISUPPER(*s)) {
4522 *s = Py_UNICODE_TOLOWER(*s);
4523 status = 1;
4524 }
4525 s++;
4526 }
4527 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528}
4529
4530static
4531int fixtitle(PyUnicodeObject *self)
4532{
4533 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4534 register Py_UNICODE *e;
4535 int previous_is_cased;
4536
4537 /* Shortcut for single character strings */
4538 if (PyUnicode_GET_SIZE(self) == 1) {
4539 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4540 if (*p != ch) {
4541 *p = ch;
4542 return 1;
4543 }
4544 else
4545 return 0;
4546 }
Tim Petersced69f82003-09-16 20:30:58 +00004547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 e = p + PyUnicode_GET_SIZE(self);
4549 previous_is_cased = 0;
4550 for (; p < e; p++) {
4551 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004552
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 if (previous_is_cased)
4554 *p = Py_UNICODE_TOLOWER(ch);
4555 else
4556 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004557
4558 if (Py_UNICODE_ISLOWER(ch) ||
4559 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 Py_UNICODE_ISTITLE(ch))
4561 previous_is_cased = 1;
4562 else
4563 previous_is_cased = 0;
4564 }
4565 return 1;
4566}
4567
Tim Peters8ce9f162004-08-27 01:49:32 +00004568PyObject *
4569PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Tim Peters8ce9f162004-08-27 01:49:32 +00004571 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004572 const Py_UNICODE blank = ' ';
4573 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004574 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004575 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004576 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4577 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004578 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4579 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004581 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004582 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Tim Peters05eba1f2004-08-27 21:32:02 +00004584 fseq = PySequence_Fast(seq, "");
4585 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004586 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004587 }
4588
Tim Peters91879ab2004-08-27 22:35:44 +00004589 /* Grrrr. A codec may be invoked to convert str objects to
4590 * Unicode, and so it's possible to call back into Python code
4591 * during PyUnicode_FromObject(), and so it's possible for a sick
4592 * codec to change the size of fseq (if seq is a list). Therefore
4593 * we have to keep refetching the size -- can't assume seqlen
4594 * is invariant.
4595 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004596 seqlen = PySequence_Fast_GET_SIZE(fseq);
4597 /* If empty sequence, return u"". */
4598 if (seqlen == 0) {
4599 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4600 goto Done;
4601 }
4602 /* If singleton sequence with an exact Unicode, return that. */
4603 if (seqlen == 1) {
4604 item = PySequence_Fast_GET_ITEM(fseq, 0);
4605 if (PyUnicode_CheckExact(item)) {
4606 Py_INCREF(item);
4607 res = (PyUnicodeObject *)item;
4608 goto Done;
4609 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004610 }
4611
Tim Peters05eba1f2004-08-27 21:32:02 +00004612 /* At least two items to join, or one that isn't exact Unicode. */
4613 if (seqlen > 1) {
4614 /* Set up sep and seplen -- they're needed. */
4615 if (separator == NULL) {
4616 sep = &blank;
4617 seplen = 1;
4618 }
4619 else {
4620 internal_separator = PyUnicode_FromObject(separator);
4621 if (internal_separator == NULL)
4622 goto onError;
4623 sep = PyUnicode_AS_UNICODE(internal_separator);
4624 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004625 /* In case PyUnicode_FromObject() mutated seq. */
4626 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004627 }
4628 }
4629
4630 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004631 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004633 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 res_p = PyUnicode_AS_UNICODE(res);
4635 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004636
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004638 Py_ssize_t itemlen;
4639 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004640
4641 item = PySequence_Fast_GET_ITEM(fseq, i);
4642 /* Convert item to Unicode. */
4643 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4644 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004645 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004646 " %.80s found",
4647 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004648 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004649 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 item = PyUnicode_FromObject(item);
4651 if (item == NULL)
4652 goto onError;
4653 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004654
Tim Peters91879ab2004-08-27 22:35:44 +00004655 /* In case PyUnicode_FromObject() mutated seq. */
4656 seqlen = PySequence_Fast_GET_SIZE(fseq);
4657
Tim Peters8ce9f162004-08-27 01:49:32 +00004658 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004660 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004662 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 if (i < seqlen - 1) {
4664 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004665 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 goto Overflow;
4667 }
4668 if (new_res_used > res_alloc) {
4669 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004670 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004672 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004673 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004675 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004676 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004678 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004681
4682 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004683 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 res_p += itemlen;
4685 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004686 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004687 res_p += seplen;
4688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 res_used = new_res_used;
4691 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004692
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 /* Shrink res to match the used area; this probably can't fail,
4694 * but it's cheap to check.
4695 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004696 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004697 goto onError;
4698
4699 Done:
4700 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004701 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return (PyObject *)res;
4703
Tim Peters8ce9f162004-08-27 01:49:32 +00004704 Overflow:
4705 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004707 Py_DECREF(item);
4708 /* fall through */
4709
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004711 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004712 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004713 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 return NULL;
4715}
4716
Tim Petersced69f82003-09-16 20:30:58 +00004717static
4718PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t left,
4720 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 Py_UNICODE fill)
4722{
4723 PyUnicodeObject *u;
4724
4725 if (left < 0)
4726 left = 0;
4727 if (right < 0)
4728 right = 0;
4729
Tim Peters7a29bd52001-09-12 03:03:31 +00004730 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 Py_INCREF(self);
4732 return self;
4733 }
4734
4735 u = _PyUnicode_New(left + self->length + right);
4736 if (u) {
4737 if (left)
4738 Py_UNICODE_FILL(u->str, fill, left);
4739 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4740 if (right)
4741 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4742 }
4743
4744 return u;
4745}
4746
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.
   Wrapped in do { } while (0) so that the macro behaves as one statement
   and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4757
4758static
4759PyObject *split_whitespace(PyUnicodeObject *self,
4760 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 register Py_ssize_t i;
4764 register Py_ssize_t j;
4765 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 PyObject *str;
4767
4768 for (i = j = 0; i < len; ) {
4769 /* find a token */
4770 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4771 i++;
4772 j = i;
4773 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4774 i++;
4775 if (j < i) {
4776 if (maxcount-- <= 0)
4777 break;
4778 SPLIT_APPEND(self->str, j, i);
4779 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4780 i++;
4781 j = i;
4782 }
4783 }
4784 if (j < len) {
4785 SPLIT_APPEND(self->str, j, len);
4786 }
4787 return list;
4788
4789 onError:
4790 Py_DECREF(list);
4791 return NULL;
4792}
4793
4794PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004795 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 register Py_ssize_t i;
4798 register Py_ssize_t j;
4799 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 PyObject *list;
4801 PyObject *str;
4802 Py_UNICODE *data;
4803
4804 string = PyUnicode_FromObject(string);
4805 if (string == NULL)
4806 return NULL;
4807 data = PyUnicode_AS_UNICODE(string);
4808 len = PyUnicode_GET_SIZE(string);
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 list = PyList_New(0);
4811 if (!list)
4812 goto onError;
4813
4814 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004818 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
4821 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004822 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 if (i < len) {
4824 if (data[i] == '\r' && i + 1 < len &&
4825 data[i+1] == '\n')
4826 i += 2;
4827 else
4828 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004829 if (keepends)
4830 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Guido van Rossum86662912000-04-11 15:38:46 +00004832 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 j = i;
4834 }
4835 if (j < len) {
4836 SPLIT_APPEND(data, j, len);
4837 }
4838
4839 Py_DECREF(string);
4840 return list;
4841
4842 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004843 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 Py_DECREF(string);
4845 return NULL;
4846}
4847
Tim Petersced69f82003-09-16 20:30:58 +00004848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849PyObject *split_char(PyUnicodeObject *self,
4850 PyObject *list,
4851 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 register Py_ssize_t i;
4855 register Py_ssize_t j;
4856 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 PyObject *str;
4858
4859 for (i = j = 0; i < len; ) {
4860 if (self->str[i] == ch) {
4861 if (maxcount-- <= 0)
4862 break;
4863 SPLIT_APPEND(self->str, j, i);
4864 i = j = i + 1;
4865 } else
4866 i++;
4867 }
4868 if (j <= len) {
4869 SPLIT_APPEND(self->str, j, len);
4870 }
4871 return list;
4872
4873 onError:
4874 Py_DECREF(list);
4875 return NULL;
4876}
4877
Tim Petersced69f82003-09-16 20:30:58 +00004878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879PyObject *split_substring(PyUnicodeObject *self,
4880 PyObject *list,
4881 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 register Py_ssize_t i;
4885 register Py_ssize_t j;
4886 Py_ssize_t len = self->length;
4887 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 PyObject *str;
4889
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004890 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 if (Py_UNICODE_MATCH(self, i, substring)) {
4892 if (maxcount-- <= 0)
4893 break;
4894 SPLIT_APPEND(self->str, j, i);
4895 i = j = i + sublen;
4896 } else
4897 i++;
4898 }
4899 if (j <= len) {
4900 SPLIT_APPEND(self->str, j, len);
4901 }
4902 return list;
4903
4904 onError:
4905 Py_DECREF(list);
4906 return NULL;
4907}
4908
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004909static
4910PyObject *rsplit_whitespace(PyUnicodeObject *self,
4911 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004913{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 register Py_ssize_t i;
4915 register Py_ssize_t j;
4916 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004917 PyObject *str;
4918
4919 for (i = j = len - 1; i >= 0; ) {
4920 /* find a token */
4921 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4922 i--;
4923 j = i;
4924 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4925 i--;
4926 if (j > i) {
4927 if (maxcount-- <= 0)
4928 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004929 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004930 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4931 i--;
4932 j = i;
4933 }
4934 }
4935 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004936 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004937 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004938 if (PyList_Reverse(list) < 0)
4939 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940 return list;
4941
4942 onError:
4943 Py_DECREF(list);
4944 return NULL;
4945}
4946
4947static
4948PyObject *rsplit_char(PyUnicodeObject *self,
4949 PyObject *list,
4950 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004951 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004953 register Py_ssize_t i;
4954 register Py_ssize_t j;
4955 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956 PyObject *str;
4957
4958 for (i = j = len - 1; i >= 0; ) {
4959 if (self->str[i] == ch) {
4960 if (maxcount-- <= 0)
4961 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004962 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004963 j = i = i - 1;
4964 } else
4965 i--;
4966 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004967 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004968 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004970 if (PyList_Reverse(list) < 0)
4971 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972 return list;
4973
4974 onError:
4975 Py_DECREF(list);
4976 return NULL;
4977}
4978
4979static
4980PyObject *rsplit_substring(PyUnicodeObject *self,
4981 PyObject *list,
4982 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004983 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 register Py_ssize_t i;
4986 register Py_ssize_t j;
4987 Py_ssize_t len = self->length;
4988 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 PyObject *str;
4990
4991 for (i = len - sublen, j = len; i >= 0; ) {
4992 if (Py_UNICODE_MATCH(self, i, substring)) {
4993 if (maxcount-- <= 0)
4994 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004995 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996 j = i;
4997 i -= sublen;
4998 } else
4999 i--;
5000 }
5001 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005002 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005004 if (PyList_Reverse(list) < 0)
5005 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005006 return list;
5007
5008 onError:
5009 Py_DECREF(list);
5010 return NULL;
5011}
5012
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013#undef SPLIT_APPEND
5014
5015static
5016PyObject *split(PyUnicodeObject *self,
5017 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005018 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019{
5020 PyObject *list;
5021
5022 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005023 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
5025 list = PyList_New(0);
5026 if (!list)
5027 return NULL;
5028
5029 if (substring == NULL)
5030 return split_whitespace(self,list,maxcount);
5031
5032 else if (substring->length == 1)
5033 return split_char(self,list,substring->str[0],maxcount);
5034
5035 else if (substring->length == 0) {
5036 Py_DECREF(list);
5037 PyErr_SetString(PyExc_ValueError, "empty separator");
5038 return NULL;
5039 }
5040 else
5041 return split_substring(self,list,substring,maxcount);
5042}
5043
Tim Petersced69f82003-09-16 20:30:58 +00005044static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005045PyObject *rsplit(PyUnicodeObject *self,
5046 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048{
5049 PyObject *list;
5050
5051 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005052 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053
5054 list = PyList_New(0);
5055 if (!list)
5056 return NULL;
5057
5058 if (substring == NULL)
5059 return rsplit_whitespace(self,list,maxcount);
5060
5061 else if (substring->length == 1)
5062 return rsplit_char(self,list,substring->str[0],maxcount);
5063
5064 else if (substring->length == 0) {
5065 Py_DECREF(list);
5066 PyErr_SetString(PyExc_ValueError, "empty separator");
5067 return NULL;
5068 }
5069 else
5070 return rsplit_substring(self,list,substring,maxcount);
5071}
5072
5073static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074PyObject *replace(PyUnicodeObject *self,
5075 PyUnicodeObject *str1,
5076 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078{
5079 PyUnicodeObject *u;
5080
5081 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005082 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Thomas Wouters477c8d52006-05-27 19:21:47 +00005084 if (str1->length == str2->length) {
5085 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005086 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005087 if (str1->length == 1) {
5088 /* replace characters */
5089 Py_UNICODE u1, u2;
5090 if (!findchar(self->str, self->length, str1->str[0]))
5091 goto nothing;
5092 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5093 if (!u)
5094 return NULL;
5095 Py_UNICODE_COPY(u->str, self->str, self->length);
5096 u1 = str1->str[0];
5097 u2 = str2->str[0];
5098 for (i = 0; i < u->length; i++)
5099 if (u->str[i] == u1) {
5100 if (--maxcount < 0)
5101 break;
5102 u->str[i] = u2;
5103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005105 i = fastsearch(
5106 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005108 if (i < 0)
5109 goto nothing;
5110 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5111 if (!u)
5112 return NULL;
5113 Py_UNICODE_COPY(u->str, self->str, self->length);
5114 while (i <= self->length - str1->length)
5115 if (Py_UNICODE_MATCH(self, i, str1)) {
5116 if (--maxcount < 0)
5117 break;
5118 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5119 i += str1->length;
5120 } else
5121 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005124
5125 Py_ssize_t n, i, j, e;
5126 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 Py_UNICODE *p;
5128
5129 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005130 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 if (n > maxcount)
5132 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005133 if (n == 0)
5134 goto nothing;
5135 /* new_size = self->length + n * (str2->length - str1->length)); */
5136 delta = (str2->length - str1->length);
5137 if (delta == 0) {
5138 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005140 product = n * (str2->length - str1->length);
5141 if ((product / (str2->length - str1->length)) != n) {
5142 PyErr_SetString(PyExc_OverflowError,
5143 "replace string is too long");
5144 return NULL;
5145 }
5146 new_size = self->length + product;
5147 if (new_size < 0) {
5148 PyErr_SetString(PyExc_OverflowError,
5149 "replace string is too long");
5150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
5152 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 u = _PyUnicode_New(new_size);
5154 if (!u)
5155 return NULL;
5156 i = 0;
5157 p = u->str;
5158 e = self->length - str1->length;
5159 if (str1->length > 0) {
5160 while (n-- > 0) {
5161 /* look for next match */
5162 j = i;
5163 while (j <= e) {
5164 if (Py_UNICODE_MATCH(self, j, str1))
5165 break;
5166 j++;
5167 }
5168 if (j > i) {
5169 if (j > e)
5170 break;
5171 /* copy unchanged part [i:j] */
5172 Py_UNICODE_COPY(p, self->str+i, j-i);
5173 p += j - i;
5174 }
5175 /* copy substitution string */
5176 if (str2->length > 0) {
5177 Py_UNICODE_COPY(p, str2->str, str2->length);
5178 p += str2->length;
5179 }
5180 i = j + str1->length;
5181 }
5182 if (i < self->length)
5183 /* copy tail [i:] */
5184 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5185 } else {
5186 /* interleave */
5187 while (n > 0) {
5188 Py_UNICODE_COPY(p, str2->str, str2->length);
5189 p += str2->length;
5190 if (--n <= 0)
5191 break;
5192 *p++ = self->str[i++];
5193 }
5194 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005198
5199nothing:
5200 /* nothing to replace; return original string (when possible) */
5201 if (PyUnicode_CheckExact(self)) {
5202 Py_INCREF(self);
5203 return (PyObject *) self;
5204 }
5205 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206}
5207
5208/* --- Unicode Object Methods --------------------------------------------- */
5209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005210PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211"S.title() -> unicode\n\
5212\n\
5213Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005214characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005217unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return fixup(self, fixtitle);
5220}
5221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005222PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223"S.capitalize() -> unicode\n\
5224\n\
5225Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005226have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005229unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 return fixup(self, fixcapitalize);
5232}
5233
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits S on whitespace, capitalizes every word in place in the
   temporary list and joins the words again via PyUnicode_Join() with a
   NULL separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005272/* Argument converter. Coerces to a single unicode character */
5273
5274static int
5275convert_uc(PyObject *obj, void *addr)
5276{
5277 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5278 PyObject *uniobj;
5279 Py_UNICODE *unistr;
5280
5281 uniobj = PyUnicode_FromObject(obj);
5282 if (uniobj == NULL) {
5283 PyErr_SetString(PyExc_TypeError,
5284 "The fill character cannot be converted to Unicode");
5285 return 0;
5286 }
5287 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5288 PyErr_SetString(PyExc_TypeError,
5289 "The fill character must be exactly one character long");
5290 Py_DECREF(uniobj);
5291 return 0;
5292 }
5293 unistr = PyUnicode_AS_UNICODE(uniobj);
5294 *fillcharloc = unistr[0];
5295 Py_DECREF(uniobj);
5296 return 1;
5297}
5298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005299PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005300"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005302Return S centered in a Unicode string of length width. Padding is\n\
5303done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305static PyObject *
5306unicode_center(PyUnicodeObject *self, PyObject *args)
5307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t marg, left;
5309 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005310 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Thomas Woutersde017742006-02-16 19:34:37 +00005312 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 return NULL;
5314
Tim Peters7a29bd52001-09-12 03:03:31 +00005315 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 Py_INCREF(self);
5317 return (PyObject*) self;
5318 }
5319
5320 marg = width - self->length;
5321 left = marg / 2 + (marg & width & 1);
5322
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005323 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324}
5325
Marc-André Lemburge5034372000-08-08 08:04:29 +00005326#if 0
5327
5328/* This code should go into some future Unicode collation support
5329 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005330 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005331
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005332/* speedy UTF-16 code point order comparison */
5333/* gleaned from: */
5334/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5335
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005336static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005337{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005338 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005339 0, 0, 0, 0, 0, 0, 0, 0,
5340 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005341 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005342};
5343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344static int
5345unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005348
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_UNICODE *s1 = str1->str;
5350 Py_UNICODE *s2 = str2->str;
5351
5352 len1 = str1->length;
5353 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005356 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005357
5358 c1 = *s1++;
5359 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005360
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005361 if (c1 > (1<<11) * 26)
5362 c1 += utf16Fixup[c1>>11];
5363 if (c2 > (1<<11) * 26)
5364 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005365 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005366
5367 if (c1 != c2)
5368 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005369
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005370 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 }
5372
5373 return (len1 < len2) ? -1 : (len1 != len2);
5374}
5375
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376#else
5377
5378static int
5379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382
5383 Py_UNICODE *s1 = str1->str;
5384 Py_UNICODE *s2 = str2->str;
5385
5386 len1 = str1->length;
5387 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Marc-André Lemburge5034372000-08-08 08:04:29 +00005389 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005390 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005391
Fredrik Lundh45714e92001-06-26 16:39:36 +00005392 c1 = *s1++;
5393 c2 = *s2++;
5394
5395 if (c1 != c2)
5396 return (c1 < c2) ? -1 : 1;
5397
Marc-André Lemburge5034372000-08-08 08:04:29 +00005398 len1--; len2--;
5399 }
5400
5401 return (len1 < len2) ? -1 : (len1 != len2);
5402}
5403
5404#endif
5405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406int PyUnicode_Compare(PyObject *left,
5407 PyObject *right)
5408{
5409 PyUnicodeObject *u = NULL, *v = NULL;
5410 int result;
5411
5412 /* Coerce the two arguments */
5413 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5414 if (u == NULL)
5415 goto onError;
5416 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5417 if (v == NULL)
5418 goto onError;
5419
Thomas Wouters7e474022000-07-16 12:04:32 +00005420 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 if (v == u) {
5422 Py_DECREF(u);
5423 Py_DECREF(v);
5424 return 0;
5425 }
5426
5427 result = unicode_compare(u, v);
5428
5429 Py_DECREF(u);
5430 Py_DECREF(v);
5431 return result;
5432
5433onError:
5434 Py_XDECREF(u);
5435 Py_XDECREF(v);
5436 return -1;
5437}
5438
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005439PyObject *PyUnicode_RichCompare(PyObject *left,
5440 PyObject *right,
5441 int op)
5442{
5443 int result;
5444
5445 result = PyUnicode_Compare(left, right);
5446 if (result == -1 && PyErr_Occurred())
5447 goto onError;
5448
5449 /* Convert the return value to a Boolean */
5450 switch (op) {
5451 case Py_EQ:
5452 result = (result == 0);
5453 break;
5454 case Py_NE:
5455 result = (result != 0);
5456 break;
5457 case Py_LE:
5458 result = (result <= 0);
5459 break;
5460 case Py_GE:
5461 result = (result >= 0);
5462 break;
5463 case Py_LT:
5464 result = (result == -1);
5465 break;
5466 case Py_GT:
5467 result = (result == 1);
5468 break;
5469 }
5470 return PyBool_FromLong(result);
5471
5472 onError:
5473
5474 /* Standard case
5475
5476 Type errors mean that PyUnicode_FromObject() could not convert
5477 one of the arguments (usually the right hand side) to Unicode,
5478 ie. we can't handle the comparison request. However, it is
5479 possible that the other object knows a comparison method, which
5480 is why we return Py_NotImplemented to give the other object a
5481 chance.
5482
5483 */
5484 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5485 PyErr_Clear();
5486 Py_INCREF(Py_NotImplemented);
5487 return Py_NotImplemented;
5488 }
5489 if (op != Py_EQ && op != Py_NE)
5490 return NULL;
5491
5492 /* Equality comparison.
5493
5494 This is a special case: we silence any PyExc_UnicodeDecodeError
5495 and instead turn it into a PyErr_UnicodeWarning.
5496
5497 */
5498 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5499 return NULL;
5500 PyErr_Clear();
5501 if (PyErr_Warn(PyExc_UnicodeWarning,
5502 (op == Py_EQ) ?
5503 "Unicode equal comparison "
5504 "failed to convert both arguments to Unicode - "
5505 "interpreting them as being unequal" :
5506 "Unicode unequal comparison "
5507 "failed to convert both arguments to Unicode - "
5508 "interpreting them as being unequal"
5509 ) < 0)
5510 return NULL;
5511 result = (op == Py_NE);
5512 return PyBool_FromLong(result);
5513}
5514
Guido van Rossum403d68b2000-03-13 15:55:09 +00005515int PyUnicode_Contains(PyObject *container,
5516 PyObject *element)
5517{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005518 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005520
5521 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005522 sub = PyUnicode_FromObject(element);
5523 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005524 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005525 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005526 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005527 }
5528
Thomas Wouters477c8d52006-05-27 19:21:47 +00005529 str = PyUnicode_FromObject(container);
5530 if (!str) {
5531 Py_DECREF(sub);
5532 return -1;
5533 }
5534
5535 result = stringlib_contains_obj(str, sub);
5536
5537 Py_DECREF(str);
5538 Py_DECREF(sub);
5539
Guido van Rossum403d68b2000-03-13 15:55:09 +00005540 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005541}
5542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543/* Concat to string or Unicode object giving a new Unicode object. */
5544
5545PyObject *PyUnicode_Concat(PyObject *left,
5546 PyObject *right)
5547{
5548 PyUnicodeObject *u = NULL, *v = NULL, *w;
5549
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005550 if (PyBytes_Check(left) || PyBytes_Check(right))
5551 return PyBytes_Concat(left, right);
5552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 /* Coerce the two arguments */
5554 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5555 if (u == NULL)
5556 goto onError;
5557 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5558 if (v == NULL)
5559 goto onError;
5560
5561 /* Shortcuts */
5562 if (v == unicode_empty) {
5563 Py_DECREF(v);
5564 return (PyObject *)u;
5565 }
5566 if (u == unicode_empty) {
5567 Py_DECREF(u);
5568 return (PyObject *)v;
5569 }
5570
5571 /* Concat the two Unicode strings */
5572 w = _PyUnicode_New(u->length + v->length);
5573 if (w == NULL)
5574 goto onError;
5575 Py_UNICODE_COPY(w->str, u->str, u->length);
5576 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5577
5578 Py_DECREF(u);
5579 Py_DECREF(v);
5580 return (PyObject *)w;
5581
5582onError:
5583 Py_XDECREF(u);
5584 Py_XDECREF(v);
5585 return NULL;
5586}
5587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005588PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589"S.count(sub[, start[, end]]) -> int\n\
5590\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005591Return the number of non-overlapping occurrences of substring sub in\n\
5592Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005593interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
5595static PyObject *
5596unicode_count(PyUnicodeObject *self, PyObject *args)
5597{
5598 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005599 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005600 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 PyObject *result;
5602
Guido van Rossumb8872e62000-05-09 14:14:27 +00005603 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5604 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return NULL;
5606
5607 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005608 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 if (substring == NULL)
5610 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005611
Thomas Wouters477c8d52006-05-27 19:21:47 +00005612 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
Thomas Wouters477c8d52006-05-27 19:21:47 +00005614 result = PyInt_FromSsize_t(
5615 stringlib_count(self->str + start, end - start,
5616 substring->str, substring->length)
5617 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
5619 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005620
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 return result;
5622}
5623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005624PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005625"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005627Encodes S using the codec registered for encoding. encoding defaults\n\
5628to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005629handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5631'xmlcharrefreplace' as well as any other name registered with\n\
5632codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
5634static PyObject *
5635unicode_encode(PyUnicodeObject *self, PyObject *args)
5636{
5637 char *encoding = NULL;
5638 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005639 PyObject *v;
5640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5642 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005643 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005644 if (v == NULL)
5645 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005646 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005647 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005648 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005649 "(type=%.400s)",
5650 v->ob_type->tp_name);
5651 Py_DECREF(v);
5652 return NULL;
5653 }
5654 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005655
5656 onError:
5657 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005658}
5659
5660PyDoc_STRVAR(decode__doc__,
5661"S.decode([encoding[,errors]]) -> string or unicode\n\
5662\n\
5663Decodes S using the codec registered for encoding. encoding defaults\n\
5664to the default encoding. errors may be given to set a different error\n\
5665handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5666a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5667as well as any other name registerd with codecs.register_error that is\n\
5668able to handle UnicodeDecodeErrors.");
5669
5670static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005671unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005672{
5673 char *encoding = NULL;
5674 char *errors = NULL;
5675 PyObject *v;
5676
5677 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5678 return NULL;
5679 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005680 if (v == NULL)
5681 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005682 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5683 PyErr_Format(PyExc_TypeError,
5684 "decoder did not return a string/unicode object "
5685 "(type=%.400s)",
5686 v->ob_type->tp_name);
5687 Py_DECREF(v);
5688 return NULL;
5689 }
5690 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005691
5692 onError:
5693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694}
5695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697"S.expandtabs([tabsize]) -> unicode\n\
5698\n\
5699Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005700If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
5702static PyObject*
5703unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5704{
5705 Py_UNICODE *e;
5706 Py_UNICODE *p;
5707 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005708 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 PyUnicodeObject *u;
5710 int tabsize = 8;
5711
5712 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5713 return NULL;
5714
Thomas Wouters7e474022000-07-16 12:04:32 +00005715 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 i = j = 0;
5717 e = self->str + self->length;
5718 for (p = self->str; p < e; p++)
5719 if (*p == '\t') {
5720 if (tabsize > 0)
5721 j += tabsize - (j % tabsize);
5722 }
5723 else {
5724 j++;
5725 if (*p == '\n' || *p == '\r') {
5726 i += j;
5727 j = 0;
5728 }
5729 }
5730
5731 /* Second pass: create output string and fill it */
5732 u = _PyUnicode_New(i + j);
5733 if (!u)
5734 return NULL;
5735
5736 j = 0;
5737 q = u->str;
5738
5739 for (p = self->str; p < e; p++)
5740 if (*p == '\t') {
5741 if (tabsize > 0) {
5742 i = tabsize - (j % tabsize);
5743 j += i;
5744 while (i--)
5745 *q++ = ' ';
5746 }
5747 }
5748 else {
5749 j++;
5750 *q++ = *p;
5751 if (*p == '\n' || *p == '\r')
5752 j = 0;
5753 }
5754
5755 return (PyObject*) u;
5756}
5757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005758PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759"S.find(sub [,start [,end]]) -> int\n\
5760\n\
5761Return the lowest index in S where substring sub is found,\n\
5762such that sub is contained within s[start,end]. Optional\n\
5763arguments start and end are interpreted as in slice notation.\n\
5764\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005765Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
5767static PyObject *
5768unicode_find(PyUnicodeObject *self, PyObject *args)
5769{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005770 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005771 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005772 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005773 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Guido van Rossumb8872e62000-05-09 14:14:27 +00005775 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5776 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005778 substring = PyUnicode_FromObject(substring);
5779 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 return NULL;
5781
Thomas Wouters477c8d52006-05-27 19:21:47 +00005782 result = stringlib_find_slice(
5783 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5784 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5785 start, end
5786 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
5788 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005789
5790 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791}
5792
5793static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795{
5796 if (index < 0 || index >= self->length) {
5797 PyErr_SetString(PyExc_IndexError, "string index out of range");
5798 return NULL;
5799 }
5800
5801 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5802}
5803
5804static long
5805unicode_hash(PyUnicodeObject *self)
5806{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005807 /* Since Unicode objects compare equal to their ASCII string
5808 counterparts, they should use the individual character values
5809 as basis for their hash value. This is needed to assure that
5810 strings and Unicode objects behave in the same way as
5811 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Martin v. Löwis18e16552006-02-15 17:27:45 +00005813 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005814 register Py_UNICODE *p;
5815 register long x;
5816
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 if (self->hash != -1)
5818 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005819 len = PyUnicode_GET_SIZE(self);
5820 p = PyUnicode_AS_UNICODE(self);
5821 x = *p << 7;
5822 while (--len >= 0)
5823 x = (1000003*x) ^ *p++;
5824 x ^= PyUnicode_GET_SIZE(self);
5825 if (x == -1)
5826 x = -2;
5827 self->hash = x;
5828 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829}
5830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005831PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832"S.index(sub [,start [,end]]) -> int\n\
5833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005834Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
5836static PyObject *
5837unicode_index(PyUnicodeObject *self, PyObject *args)
5838{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005840 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005841 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005842 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
Guido van Rossumb8872e62000-05-09 14:14:27 +00005844 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5845 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005847 substring = PyUnicode_FromObject(substring);
5848 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 return NULL;
5850
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851 result = stringlib_find_slice(
5852 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5853 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5854 start, end
5855 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
5857 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 if (result < 0) {
5860 PyErr_SetString(PyExc_ValueError, "substring not found");
5861 return NULL;
5862 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865}
5866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005867PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005868"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005870Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005871at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
5873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005874unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
5876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5877 register const Py_UNICODE *e;
5878 int cased;
5879
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 /* Shortcut for single character strings */
5881 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005882 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005884 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005885 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005886 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 e = p + PyUnicode_GET_SIZE(self);
5889 cased = 0;
5890 for (; p < e; p++) {
5891 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005894 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 else if (!cased && Py_UNICODE_ISLOWER(ch))
5896 cased = 1;
5897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005898 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899}
5900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005901PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005902"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005904Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005905at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
5907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005908unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
5910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5911 register const Py_UNICODE *e;
5912 int cased;
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 /* Shortcut for single character strings */
5915 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005916 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005918 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005919 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005920 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 e = p + PyUnicode_GET_SIZE(self);
5923 cased = 0;
5924 for (; p < e; p++) {
5925 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005928 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 else if (!cased && Py_UNICODE_ISUPPER(ch))
5930 cased = 1;
5931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005932 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005936"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005938Return True if S is a titlecased string and there is at least one\n\
5939character in S, i.e. upper- and titlecase characters may only\n\
5940follow uncased characters and lowercase characters only cased ones.\n\
5941Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
5943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005944unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945{
5946 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5947 register const Py_UNICODE *e;
5948 int cased, previous_is_cased;
5949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 /* Shortcut for single character strings */
5951 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005952 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5953 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005956 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 e = p + PyUnicode_GET_SIZE(self);
5960 cased = 0;
5961 previous_is_cased = 0;
5962 for (; p < e; p++) {
5963 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5966 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005967 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 previous_is_cased = 1;
5969 cased = 1;
5970 }
5971 else if (Py_UNICODE_ISLOWER(ch)) {
5972 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005973 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 previous_is_cased = 1;
5975 cased = 1;
5976 }
5977 else
5978 previous_is_cased = 0;
5979 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005980 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981}
5982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005983PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005984"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005986Return True if all characters in S are whitespace\n\
5987and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
5989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005990unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
5992 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5993 register const Py_UNICODE *e;
5994
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 /* Shortcut for single character strings */
5996 if (PyUnicode_GET_SIZE(self) == 1 &&
5997 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005998 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006000 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006001 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006003
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 e = p + PyUnicode_GET_SIZE(self);
6005 for (; p < e; p++) {
6006 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010}
6011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006012PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006013"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006014\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006015Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006016and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006017
6018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006019unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006020{
6021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6022 register const Py_UNICODE *e;
6023
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006024 /* Shortcut for single character strings */
6025 if (PyUnicode_GET_SIZE(self) == 1 &&
6026 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006027 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006028
6029 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006030 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006031 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006032
6033 e = p + PyUnicode_GET_SIZE(self);
6034 for (; p < e; p++) {
6035 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006039}
6040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006042"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006043\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006044Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046
6047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006048unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049{
6050 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6051 register const Py_UNICODE *e;
6052
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006053 /* Shortcut for single character strings */
6054 if (PyUnicode_GET_SIZE(self) == 1 &&
6055 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006057
6058 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006059 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006060 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006061
6062 e = p + PyUnicode_GET_SIZE(self);
6063 for (; p < e; p++) {
6064 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006067 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068}
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006071"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006073Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
6076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006077unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
6079 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6080 register const Py_UNICODE *e;
6081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 /* Shortcut for single character strings */
6083 if (PyUnicode_GET_SIZE(self) == 1 &&
6084 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006085 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006087 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006088 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006089 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006090
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 e = p + PyUnicode_GET_SIZE(self);
6092 for (; p < e; p++) {
6093 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097}
6098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006100"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006102Return True if all characters in S are digits\n\
6103and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
6105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006106unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
6108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6109 register const Py_UNICODE *e;
6110
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 /* Shortcut for single character strings */
6112 if (PyUnicode_GET_SIZE(self) == 1 &&
6113 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006116 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006117 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006118 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 e = p + PyUnicode_GET_SIZE(self);
6121 for (; p < e; p++) {
6122 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126}
6127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006128PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006129"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006131Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006132False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
6134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006135unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136{
6137 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6138 register const Py_UNICODE *e;
6139
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 /* Shortcut for single character strings */
6141 if (PyUnicode_GET_SIZE(self) == 1 &&
6142 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006143 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006145 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006146 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 e = p + PyUnicode_GET_SIZE(self);
6150 for (; p < e; p++) {
6151 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006157PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158"S.join(sequence) -> unicode\n\
6159\n\
6160Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006161sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
6163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006164unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006166 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167}
6168
Martin v. Löwis18e16552006-02-15 17:27:45 +00006169static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170unicode_length(PyUnicodeObject *self)
6171{
6172 return self->length;
6173}
6174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006175PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006176"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177\n\
6178Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006179done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181static PyObject *
6182unicode_ljust(PyUnicodeObject *self, PyObject *args)
6183{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006184 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006185 Py_UNICODE fillchar = ' ';
6186
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006187 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 return NULL;
6189
Tim Peters7a29bd52001-09-12 03:03:31 +00006190 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 Py_INCREF(self);
6192 return (PyObject*) self;
6193 }
6194
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006195 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199"S.lower() -> unicode\n\
6200\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006201Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006204unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 return fixup(self, fixlower);
6207}
6208
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above.
   Both the strings and the table slots are read-only. */
static const char *const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6217
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006218/* externally visible for str.strip(unicode) */
6219PyObject *
6220_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6221{
6222 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006224 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006225 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6226 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006227
Thomas Wouters477c8d52006-05-27 19:21:47 +00006228 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6229
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006230 i = 0;
6231 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006232 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6233 i++;
6234 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006235 }
6236
6237 j = len;
6238 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006239 do {
6240 j--;
6241 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6242 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243 }
6244
6245 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006246 Py_INCREF(self);
6247 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006248 }
6249 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006250 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006251}
6252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
6254static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006255do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006257 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006258 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006259
6260 i = 0;
6261 if (striptype != RIGHTSTRIP) {
6262 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6263 i++;
6264 }
6265 }
6266
6267 j = len;
6268 if (striptype != LEFTSTRIP) {
6269 do {
6270 j--;
6271 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6272 j++;
6273 }
6274
6275 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6276 Py_INCREF(self);
6277 return (PyObject*)self;
6278 }
6279 else
6280 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281}
6282
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006283
6284static PyObject *
6285do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6286{
6287 PyObject *sep = NULL;
6288
6289 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6290 return NULL;
6291
6292 if (sep != NULL && sep != Py_None) {
6293 if (PyUnicode_Check(sep))
6294 return _PyUnicode_XStrip(self, striptype, sep);
6295 else if (PyString_Check(sep)) {
6296 PyObject *res;
6297 sep = PyUnicode_FromObject(sep);
6298 if (sep==NULL)
6299 return NULL;
6300 res = _PyUnicode_XStrip(self, striptype, sep);
6301 Py_DECREF(sep);
6302 return res;
6303 }
6304 else {
6305 PyErr_Format(PyExc_TypeError,
6306 "%s arg must be None, unicode or str",
6307 STRIPNAME(striptype));
6308 return NULL;
6309 }
6310 }
6311
6312 return do_strip(self, striptype);
6313}
6314
6315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006316PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006317"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006318\n\
6319Return a copy of the string S with leading and trailing\n\
6320whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006321If chars is given and not None, remove characters in chars instead.\n\
6322If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006323
6324static PyObject *
6325unicode_strip(PyUnicodeObject *self, PyObject *args)
6326{
6327 if (PyTuple_GET_SIZE(args) == 0)
6328 return do_strip(self, BOTHSTRIP); /* Common case */
6329 else
6330 return do_argstrip(self, BOTHSTRIP, args);
6331}
6332
6333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006334PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006335"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006336\n\
6337Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006338If chars is given and not None, remove characters in chars instead.\n\
6339If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006340
6341static PyObject *
6342unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6343{
6344 if (PyTuple_GET_SIZE(args) == 0)
6345 return do_strip(self, LEFTSTRIP); /* Common case */
6346 else
6347 return do_argstrip(self, LEFTSTRIP, args);
6348}
6349
6350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006351PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006352"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006353\n\
6354Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006355If chars is given and not None, remove characters in chars instead.\n\
6356If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006357
6358static PyObject *
6359unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6360{
6361 if (PyTuple_GET_SIZE(args) == 0)
6362 return do_strip(self, RIGHTSTRIP); /* Common case */
6363 else
6364 return do_argstrip(self, RIGHTSTRIP, args);
6365}
6366
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006369unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370{
6371 PyUnicodeObject *u;
6372 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006373 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006374 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375
6376 if (len < 0)
6377 len = 0;
6378
Tim Peters7a29bd52001-09-12 03:03:31 +00006379 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 /* no repeat, return original string */
6381 Py_INCREF(str);
6382 return (PyObject*) str;
6383 }
Tim Peters8f422462000-09-09 06:13:41 +00006384
6385 /* ensure # of chars needed doesn't overflow int and # of bytes
6386 * needed doesn't overflow size_t
6387 */
6388 nchars = len * str->length;
6389 if (len && nchars / len != str->length) {
6390 PyErr_SetString(PyExc_OverflowError,
6391 "repeated string is too long");
6392 return NULL;
6393 }
6394 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6395 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6396 PyErr_SetString(PyExc_OverflowError,
6397 "repeated string is too long");
6398 return NULL;
6399 }
6400 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 if (!u)
6402 return NULL;
6403
6404 p = u->str;
6405
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406 if (str->length == 1 && len > 0) {
6407 Py_UNICODE_FILL(p, str->str[0], len);
6408 } else {
6409 Py_ssize_t done = 0; /* number of characters copied this far */
6410 if (done < nchars) {
6411 Py_UNICODE_COPY(p, str->str, str->length);
6412 done = str->length;
6413 }
6414 while (done < nchars) {
6415 int n = (done <= nchars-done) ? done : nchars-done;
6416 Py_UNICODE_COPY(p+done, p, n);
6417 done += n;
6418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 }
6420
6421 return (PyObject*) u;
6422}
6423
6424PyObject *PyUnicode_Replace(PyObject *obj,
6425 PyObject *subobj,
6426 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
6429 PyObject *self;
6430 PyObject *str1;
6431 PyObject *str2;
6432 PyObject *result;
6433
6434 self = PyUnicode_FromObject(obj);
6435 if (self == NULL)
6436 return NULL;
6437 str1 = PyUnicode_FromObject(subobj);
6438 if (str1 == NULL) {
6439 Py_DECREF(self);
6440 return NULL;
6441 }
6442 str2 = PyUnicode_FromObject(replobj);
6443 if (str2 == NULL) {
6444 Py_DECREF(self);
6445 Py_DECREF(str1);
6446 return NULL;
6447 }
Tim Petersced69f82003-09-16 20:30:58 +00006448 result = replace((PyUnicodeObject *)self,
6449 (PyUnicodeObject *)str1,
6450 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 maxcount);
6452 Py_DECREF(self);
6453 Py_DECREF(str1);
6454 Py_DECREF(str2);
6455 return result;
6456}
6457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006458PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459"S.replace (old, new[, maxsplit]) -> unicode\n\
6460\n\
6461Return a copy of S with all occurrences of substring\n\
6462old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006463given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465static PyObject*
6466unicode_replace(PyUnicodeObject *self, PyObject *args)
6467{
6468 PyUnicodeObject *str1;
6469 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006470 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 PyObject *result;
6472
Martin v. Löwis18e16552006-02-15 17:27:45 +00006473 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 return NULL;
6475 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6476 if (str1 == NULL)
6477 return NULL;
6478 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006479 if (str2 == NULL) {
6480 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
6484 result = replace(self, str1, str2, maxcount);
6485
6486 Py_DECREF(str1);
6487 Py_DECREF(str2);
6488 return result;
6489}
6490
6491static
6492PyObject *unicode_repr(PyObject *unicode)
6493{
6494 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6495 PyUnicode_GET_SIZE(unicode),
6496 1);
6497}
6498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006499PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500"S.rfind(sub [,start [,end]]) -> int\n\
6501\n\
6502Return the highest index in S where substring sub is found,\n\
6503such that sub is contained within s[start,end]. Optional\n\
6504arguments start and end are interpreted as in slice notation.\n\
6505\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
6508static PyObject *
6509unicode_rfind(PyUnicodeObject *self, PyObject *args)
6510{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006512 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006513 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
Guido van Rossumb8872e62000-05-09 14:14:27 +00006516 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6517 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 substring = PyUnicode_FromObject(substring);
6520 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 return NULL;
6522
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 result = stringlib_rfind_slice(
6524 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6525 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6526 start, end
6527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528
6529 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530
6531 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532}
6533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535"S.rindex(sub [,start [,end]]) -> int\n\
6536\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006537Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539static PyObject *
6540unicode_rindex(PyUnicodeObject *self, PyObject *args)
6541{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006543 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006544 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
Guido van Rossumb8872e62000-05-09 14:14:27 +00006547 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6548 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006550 substring = PyUnicode_FromObject(substring);
6551 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return NULL;
6553
Thomas Wouters477c8d52006-05-27 19:21:47 +00006554 result = stringlib_rfind_slice(
6555 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6556 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6557 start, end
6558 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
6560 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 if (result < 0) {
6563 PyErr_SetString(PyExc_ValueError, "substring not found");
6564 return NULL;
6565 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006566 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567}
6568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006569PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006570"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571\n\
6572Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006573done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575static PyObject *
6576unicode_rjust(PyUnicodeObject *self, PyObject *args)
6577{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006578 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006579 Py_UNICODE fillchar = ' ';
6580
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006581 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return NULL;
6583
Tim Peters7a29bd52001-09-12 03:03:31 +00006584 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 Py_INCREF(self);
6586 return (PyObject*) self;
6587 }
6588
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006589 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590}
6591
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594{
6595 /* standard clamping */
6596 if (start < 0)
6597 start = 0;
6598 if (end < 0)
6599 end = 0;
6600 if (end > self->length)
6601 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006602 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 /* full slice, return original string */
6604 Py_INCREF(self);
6605 return (PyObject*) self;
6606 }
6607 if (start > end)
6608 start = end;
6609 /* copy slice */
6610 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6611 end - start);
6612}
6613
6614PyObject *PyUnicode_Split(PyObject *s,
6615 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
6618 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 s = PyUnicode_FromObject(s);
6621 if (s == NULL)
6622 return NULL;
6623 if (sep != NULL) {
6624 sep = PyUnicode_FromObject(sep);
6625 if (sep == NULL) {
6626 Py_DECREF(s);
6627 return NULL;
6628 }
6629 }
6630
6631 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6632
6633 Py_DECREF(s);
6634 Py_XDECREF(sep);
6635 return result;
6636}
6637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006638PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639"S.split([sep [,maxsplit]]) -> list of strings\n\
6640\n\
6641Return a list of the words in S, using sep as the\n\
6642delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006643splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006644any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
6646static PyObject*
6647unicode_split(PyUnicodeObject *self, PyObject *args)
6648{
6649 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
Martin v. Löwis18e16552006-02-15 17:27:45 +00006652 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 return NULL;
6654
6655 if (substring == Py_None)
6656 return split(self, NULL, maxcount);
6657 else if (PyUnicode_Check(substring))
6658 return split(self, (PyUnicodeObject *)substring, maxcount);
6659 else
6660 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6661}
6662
Thomas Wouters477c8d52006-05-27 19:21:47 +00006663PyObject *
6664PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6665{
6666 PyObject* str_obj;
6667 PyObject* sep_obj;
6668 PyObject* out;
6669
6670 str_obj = PyUnicode_FromObject(str_in);
6671 if (!str_obj)
6672 return NULL;
6673 sep_obj = PyUnicode_FromObject(sep_in);
6674 if (!sep_obj) {
6675 Py_DECREF(str_obj);
6676 return NULL;
6677 }
6678
6679 out = stringlib_partition(
6680 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6681 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6682 );
6683
6684 Py_DECREF(sep_obj);
6685 Py_DECREF(str_obj);
6686
6687 return out;
6688}
6689
6690
6691PyObject *
6692PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6693{
6694 PyObject* str_obj;
6695 PyObject* sep_obj;
6696 PyObject* out;
6697
6698 str_obj = PyUnicode_FromObject(str_in);
6699 if (!str_obj)
6700 return NULL;
6701 sep_obj = PyUnicode_FromObject(sep_in);
6702 if (!sep_obj) {
6703 Py_DECREF(str_obj);
6704 return NULL;
6705 }
6706
6707 out = stringlib_rpartition(
6708 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6709 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6710 );
6711
6712 Py_DECREF(sep_obj);
6713 Py_DECREF(str_obj);
6714
6715 return out;
6716}
6717
6718PyDoc_STRVAR(partition__doc__,
6719"S.partition(sep) -> (head, sep, tail)\n\
6720\n\
6721Searches for the separator sep in S, and returns the part before it,\n\
6722the separator itself, and the part after it. If the separator is not\n\
6723found, returns S and two empty strings.");
6724
6725static PyObject*
6726unicode_partition(PyUnicodeObject *self, PyObject *separator)
6727{
6728 return PyUnicode_Partition((PyObject *)self, separator);
6729}
6730
6731PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006732"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006733\n\
6734Searches for the separator sep in S, starting at the end of S, and returns\n\
6735the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006736separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006737
6738static PyObject*
6739unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6740{
6741 return PyUnicode_RPartition((PyObject *)self, separator);
6742}
6743
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006744PyObject *PyUnicode_RSplit(PyObject *s,
6745 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006746 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006747{
6748 PyObject *result;
6749
6750 s = PyUnicode_FromObject(s);
6751 if (s == NULL)
6752 return NULL;
6753 if (sep != NULL) {
6754 sep = PyUnicode_FromObject(sep);
6755 if (sep == NULL) {
6756 Py_DECREF(s);
6757 return NULL;
6758 }
6759 }
6760
6761 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6762
6763 Py_DECREF(s);
6764 Py_XDECREF(sep);
6765 return result;
6766}
6767
6768PyDoc_STRVAR(rsplit__doc__,
6769"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6770\n\
6771Return a list of the words in S, using sep as the\n\
6772delimiter string, starting at the end of the string and\n\
6773working to the front. If maxsplit is given, at most maxsplit\n\
6774splits are done. If sep is not specified, any whitespace string\n\
6775is a separator.");
6776
6777static PyObject*
6778unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6779{
6780 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006782
Martin v. Löwis18e16552006-02-15 17:27:45 +00006783 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006784 return NULL;
6785
6786 if (substring == Py_None)
6787 return rsplit(self, NULL, maxcount);
6788 else if (PyUnicode_Check(substring))
6789 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6790 else
6791 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6792}
6793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006795"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796\n\
6797Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006798Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
6801static PyObject*
6802unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6803{
Guido van Rossum86662912000-04-11 15:38:46 +00006804 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
Guido van Rossum86662912000-04-11 15:38:46 +00006806 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 return NULL;
6808
Guido van Rossum86662912000-04-11 15:38:46 +00006809 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810}
6811
6812static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006813PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006815 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6816 Py_XINCREF(res);
6817 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006820PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821"S.swapcase() -> unicode\n\
6822\n\
6823Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
6826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006827unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 return fixup(self, fixswapcase);
6830}
6831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833"S.translate(table) -> unicode\n\
6834\n\
6835Return a copy of the string S, where all characters have been mapped\n\
6836through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006837Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6838Unmapped characters are left untouched. Characters mapped to None\n\
6839are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
6841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006842unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
Tim Petersced69f82003-09-16 20:30:58 +00006844 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006846 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 "ignore");
6848}
6849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006850PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851"S.upper() -> unicode\n\
6852\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006853Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
6855static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006856unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return fixup(self, fixupper);
6859}
6860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862"S.zfill(width) -> unicode\n\
6863\n\
6864Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006865of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866
6867static PyObject *
6868unicode_zfill(PyUnicodeObject *self, PyObject *args)
6869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006870 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 PyUnicodeObject *u;
6872
Martin v. Löwis18e16552006-02-15 17:27:45 +00006873 Py_ssize_t width;
6874 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 return NULL;
6876
6877 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006878 if (PyUnicode_CheckExact(self)) {
6879 Py_INCREF(self);
6880 return (PyObject*) self;
6881 }
6882 else
6883 return PyUnicode_FromUnicode(
6884 PyUnicode_AS_UNICODE(self),
6885 PyUnicode_GET_SIZE(self)
6886 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 }
6888
6889 fill = width - self->length;
6890
6891 u = pad(self, fill, 0, '0');
6892
Walter Dörwald068325e2002-04-15 13:36:47 +00006893 if (u == NULL)
6894 return NULL;
6895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 if (u->str[fill] == '+' || u->str[fill] == '-') {
6897 /* move sign to beginning of string */
6898 u->str[0] = u->str[fill];
6899 u->str[fill] = '0';
6900 }
6901
6902 return (PyObject*) u;
6903}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
#if 0
/* Compiled out by the surrounding #if 0: would return the value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006913PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006916Return True if S starts with the specified prefix, False otherwise.\n\
6917With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918With optional end, stop comparing S at that position.\n\
6919prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
6921static PyObject *
6922unicode_startswith(PyUnicodeObject *self,
6923 PyObject *args)
6924{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006928 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006932 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 if (PyTuple_Check(subobj)) {
6935 Py_ssize_t i;
6936 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6937 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6938 PyTuple_GET_ITEM(subobj, i));
6939 if (substring == NULL)
6940 return NULL;
6941 result = tailmatch(self, substring, start, end, -1);
6942 Py_DECREF(substring);
6943 if (result) {
6944 Py_RETURN_TRUE;
6945 }
6946 }
6947 /* nothing matched */
6948 Py_RETURN_FALSE;
6949 }
6950 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 return NULL;
6953 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956}
6957
6958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006962Return True if S ends with the specified suffix, False otherwise.\n\
6963With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964With optional end, stop comparing S at that position.\n\
6965suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967static PyObject *
6968unicode_endswith(PyUnicodeObject *self,
6969 PyObject *args)
6970{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006973 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006974 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006975 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6978 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980 if (PyTuple_Check(subobj)) {
6981 Py_ssize_t i;
6982 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6983 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6984 PyTuple_GET_ITEM(subobj, i));
6985 if (substring == NULL)
6986 return NULL;
6987 result = tailmatch(self, substring, start, end, +1);
6988 Py_DECREF(substring);
6989 if (result) {
6990 Py_RETURN_TRUE;
6991 }
6992 }
6993 Py_RETURN_FALSE;
6994 }
6995 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002}
7003
7004
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007005
7006static PyObject *
7007unicode_getnewargs(PyUnicodeObject *v)
7008{
7009 return Py_BuildValue("(u#)", v->str, v->length);
7010}
7011
7012
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013static PyMethodDef unicode_methods[] = {
7014
7015 /* Order is according to common usage: often used methods should
7016 appear first, since lookup is done sequentially. */
7017
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007018 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7019 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7020 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007021 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007022 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7023 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7024 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7025 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7026 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7027 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7028 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007029 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7031 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7032 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007033 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007034 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007035/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7036 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7037 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7038 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007040 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007041 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007043 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7044 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7045 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7046 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7047 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7048 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7049 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7050 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7051 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7052 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7053 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7054 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7055 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7056 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007057 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007058#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060#endif
7061
7062#if 0
7063 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065#endif
7066
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007067 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 {NULL, NULL}
7069};
7070
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007071static PyObject *
7072unicode_mod(PyObject *v, PyObject *w)
7073{
7074 if (!PyUnicode_Check(v)) {
7075 Py_INCREF(Py_NotImplemented);
7076 return Py_NotImplemented;
7077 }
7078 return PyUnicode_Format(v, w);
7079}
7080
7081static PyNumberMethods unicode_as_number = {
7082 0, /*nb_add*/
7083 0, /*nb_subtract*/
7084 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007085 unicode_mod, /*nb_remainder*/
7086};
7087
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007089 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007090 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7092 (ssizeargfunc) unicode_getitem, /* sq_item */
7093 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 0, /* sq_ass_item */
7095 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007096 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097};
7098
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007099static PyObject*
7100unicode_subscript(PyUnicodeObject* self, PyObject* item)
7101{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007102 if (PyIndex_Check(item)) {
7103 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007104 if (i == -1 && PyErr_Occurred())
7105 return NULL;
7106 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007107 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007108 return unicode_getitem(self, i);
7109 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007110 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007111 Py_UNICODE* source_buf;
7112 Py_UNICODE* result_buf;
7113 PyObject* result;
7114
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007115 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007116 &start, &stop, &step, &slicelength) < 0) {
7117 return NULL;
7118 }
7119
7120 if (slicelength <= 0) {
7121 return PyUnicode_FromUnicode(NULL, 0);
7122 } else {
7123 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007124 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7125 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007126
7127 if (result_buf == NULL)
7128 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007129
7130 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7131 result_buf[i] = source_buf[cur];
7132 }
Tim Petersced69f82003-09-16 20:30:58 +00007133
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007134 result = PyUnicode_FromUnicode(result_buf, slicelength);
7135 PyMem_FREE(result_buf);
7136 return result;
7137 }
7138 } else {
7139 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7140 return NULL;
7141 }
7142}
7143
7144static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007145 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007146 (binaryfunc)unicode_subscript, /* mp_subscript */
7147 (objobjargproc)0, /* mp_ass_subscript */
7148};
7149
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007152 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 const void **ptr)
7154{
7155 if (index != 0) {
7156 PyErr_SetString(PyExc_SystemError,
7157 "accessing non-existent unicode segment");
7158 return -1;
7159 }
7160 *ptr = (void *) self->str;
7161 return PyUnicode_GET_DATA_SIZE(self);
7162}
7163
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164static Py_ssize_t
7165unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 const void **ptr)
7167{
7168 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007169 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 return -1;
7171}
7172
7173static int
7174unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007175 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176{
7177 if (lenp)
7178 *lenp = PyUnicode_GET_DATA_SIZE(self);
7179 return 1;
7180}
7181
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007182static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 const void **ptr)
7186{
7187 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 if (index != 0) {
7190 PyErr_SetString(PyExc_SystemError,
7191 "accessing non-existent unicode segment");
7192 return -1;
7193 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007194 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 if (str == NULL)
7196 return -1;
7197 *ptr = (void *) PyString_AS_STRING(str);
7198 return PyString_GET_SIZE(str);
7199}
7200
7201/* Helpers for PyUnicode_Format() */
7202
7203static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 if (argidx < arglen) {
7208 (*p_argidx)++;
7209 if (arglen < 0)
7210 return args;
7211 else
7212 return PyTuple_GetItem(args, argidx);
7213 }
7214 PyErr_SetString(PyExc_TypeError,
7215 "not enough arguments for format string");
7216 return NULL;
7217}
7218
/* Bit flags OR-ed into `flags` by PyUnicode_Format() while it scans the
   '-', '+', ' ', '#' and '0' characters of a conversion specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7224
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007226strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228 register Py_ssize_t i;
7229 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 for (i = len - 1; i >= 0; i--)
7231 buffer[i] = (Py_UNICODE) charbuffer[i];
7232
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 return len;
7234}
7235
Neal Norwitzfc76d632006-01-10 06:03:13 +00007236static int
7237doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7238{
Tim Peters15231542006-02-16 01:08:01 +00007239 Py_ssize_t result;
7240
Neal Norwitzfc76d632006-01-10 06:03:13 +00007241 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007242 result = strtounicode(buffer, (char *)buffer);
7243 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007244}
7245
7246static int
7247longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7248{
Tim Peters15231542006-02-16 01:08:01 +00007249 Py_ssize_t result;
7250
Neal Norwitzfc76d632006-01-10 06:03:13 +00007251 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007252 result = strtounicode(buffer, (char *)buffer);
7253 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007254}
7255
Guido van Rossum078151d2002-08-11 04:24:12 +00007256/* XXX To save some code duplication, formatfloat/long/int could have been
7257 shared with stringobject.c, converting from 8-bit to Unicode after the
7258 formatting is done. */
7259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260static int
7261formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007262 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 int flags,
7264 int prec,
7265 int type,
7266 PyObject *v)
7267{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007268 /* fmt = '%#.' + `prec` + `type`
7269 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 char fmt[20];
7271 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007272
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 x = PyFloat_AsDouble(v);
7274 if (x == -1.0 && PyErr_Occurred())
7275 return -1;
7276 if (prec < 0)
7277 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7279 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007280 /* Worst case length calc to ensure no buffer overrun:
7281
7282 'g' formats:
7283 fmt = %#.<prec>g
7284 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7285 for any double rep.)
7286 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7287
7288 'f' formats:
7289 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7290 len = 1 + 50 + 1 + prec = 52 + prec
7291
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007292 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007293 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007294
7295 */
7296 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7297 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007298 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007299 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007300 return -1;
7301 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007302 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7303 (flags&F_ALT) ? "#" : "",
7304 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007305 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306}
7307
Tim Peters38fd5b62000-09-21 05:43:11 +00007308static PyObject*
7309formatlong(PyObject *val, int flags, int prec, int type)
7310{
7311 char *buf;
7312 int i, len;
7313 PyObject *str; /* temporary string object. */
7314 PyUnicodeObject *result;
7315
7316 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7317 if (!str)
7318 return NULL;
7319 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007320 if (!result) {
7321 Py_DECREF(str);
7322 return NULL;
7323 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007324 for (i = 0; i < len; i++)
7325 result->str[i] = buf[i];
7326 result->str[len] = 0;
7327 Py_DECREF(str);
7328 return (PyObject*)result;
7329}
7330
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331static int
7332formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007333 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 int flags,
7335 int prec,
7336 int type,
7337 PyObject *v)
7338{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007339 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007340 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7341 * + 1 + 1
7342 * = 24
7343 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007344 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007345 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 long x;
7347
7348 x = PyInt_AsLong(v);
7349 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007350 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007351 if (x < 0 && type == 'u') {
7352 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007353 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007354 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7355 sign = "-";
7356 else
7357 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007359 prec = 1;
7360
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007361 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7362 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007363 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007364 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007365 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007366 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007367 return -1;
7368 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007369
7370 if ((flags & F_ALT) &&
7371 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007372 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007373 * of issues that cause pain:
7374 * - when 0 is being converted, the C standard leaves off
7375 * the '0x' or '0X', which is inconsistent with other
7376 * %#x/%#X conversions and inconsistent with Python's
7377 * hex() function
7378 * - there are platforms that violate the standard and
7379 * convert 0 with the '0x' or '0X'
7380 * (Metrowerks, Compaq Tru64)
7381 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007382 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007383 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007384 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007385 * We can achieve the desired consistency by inserting our
7386 * own '0x' or '0X' prefix, and substituting %x/%X in place
7387 * of %#x/%#X.
7388 *
7389 * Note that this is the same approach as used in
7390 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007391 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007392 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7393 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007394 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007395 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007396 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7397 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007398 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007399 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007400 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007401 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007402 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007403 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404}
7405
7406static int
7407formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007408 size_t buflen,
7409 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007411 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007412 if (PyUnicode_Check(v)) {
7413 if (PyUnicode_GET_SIZE(v) != 1)
7414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007418 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007419 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007420 goto onError;
7421 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423
7424 else {
7425 /* Integer input truncated to a character */
7426 long x;
7427 x = PyInt_AsLong(v);
7428 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007429 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007430#ifdef Py_UNICODE_WIDE
7431 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007432 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007433 "%c arg not in range(0x110000) "
7434 "(wide Python build)");
7435 return -1;
7436 }
7437#else
7438 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007439 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007440 "%c arg not in range(0x10000) "
7441 "(narrow Python build)");
7442 return -1;
7443 }
7444#endif
7445 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 }
7447 buf[1] = '\0';
7448 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007449
7450 onError:
7451 PyErr_SetString(PyExc_TypeError,
7452 "%c requires int or char");
7453 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454}
7455
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in Py_UNICODE units, of the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does its own bounds checking so that
   the buffer is not overrun, but a better solution may be to malloc a
   buffer of the appropriate size for each format.  For now, the current
   solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
7465
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466PyObject *PyUnicode_Format(PyObject *format,
7467 PyObject *args)
7468{
7469 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007470 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 int args_owned = 0;
7472 PyUnicodeObject *result = NULL;
7473 PyObject *dict = NULL;
7474 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 if (format == NULL || args == NULL) {
7477 PyErr_BadInternalCall();
7478 return NULL;
7479 }
7480 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007481 if (uformat == NULL)
7482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 fmt = PyUnicode_AS_UNICODE(uformat);
7484 fmtcnt = PyUnicode_GET_SIZE(uformat);
7485
7486 reslen = rescnt = fmtcnt + 100;
7487 result = _PyUnicode_New(reslen);
7488 if (result == NULL)
7489 goto onError;
7490 res = PyUnicode_AS_UNICODE(result);
7491
7492 if (PyTuple_Check(args)) {
7493 arglen = PyTuple_Size(args);
7494 argidx = 0;
7495 }
7496 else {
7497 arglen = -1;
7498 argidx = -2;
7499 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007500 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7501 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 dict = args;
7503
7504 while (--fmtcnt >= 0) {
7505 if (*fmt != '%') {
7506 if (--rescnt < 0) {
7507 rescnt = fmtcnt + 100;
7508 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007509 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7512 --rescnt;
7513 }
7514 *res++ = *fmt++;
7515 }
7516 else {
7517 /* Got a format specifier */
7518 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007519 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 Py_UNICODE c = '\0';
7522 Py_UNICODE fill;
7523 PyObject *v = NULL;
7524 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007525 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007528 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
7530 fmt++;
7531 if (*fmt == '(') {
7532 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 PyObject *key;
7535 int pcount = 1;
7536
7537 if (dict == NULL) {
7538 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007539 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 goto onError;
7541 }
7542 ++fmt;
7543 --fmtcnt;
7544 keystart = fmt;
7545 /* Skip over balanced parentheses */
7546 while (pcount > 0 && --fmtcnt >= 0) {
7547 if (*fmt == ')')
7548 --pcount;
7549 else if (*fmt == '(')
7550 ++pcount;
7551 fmt++;
7552 }
7553 keylen = fmt - keystart - 1;
7554 if (fmtcnt < 0 || pcount > 0) {
7555 PyErr_SetString(PyExc_ValueError,
7556 "incomplete format key");
7557 goto onError;
7558 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007559#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007560 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 then looked up since Python uses strings to hold
7562 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007563 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 key = PyUnicode_EncodeUTF8(keystart,
7565 keylen,
7566 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007567#else
7568 key = PyUnicode_FromUnicode(keystart, keylen);
7569#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 if (key == NULL)
7571 goto onError;
7572 if (args_owned) {
7573 Py_DECREF(args);
7574 args_owned = 0;
7575 }
7576 args = PyObject_GetItem(dict, key);
7577 Py_DECREF(key);
7578 if (args == NULL) {
7579 goto onError;
7580 }
7581 args_owned = 1;
7582 arglen = -1;
7583 argidx = -2;
7584 }
7585 while (--fmtcnt >= 0) {
7586 switch (c = *fmt++) {
7587 case '-': flags |= F_LJUST; continue;
7588 case '+': flags |= F_SIGN; continue;
7589 case ' ': flags |= F_BLANK; continue;
7590 case '#': flags |= F_ALT; continue;
7591 case '0': flags |= F_ZERO; continue;
7592 }
7593 break;
7594 }
7595 if (c == '*') {
7596 v = getnextarg(args, arglen, &argidx);
7597 if (v == NULL)
7598 goto onError;
7599 if (!PyInt_Check(v)) {
7600 PyErr_SetString(PyExc_TypeError,
7601 "* wants int");
7602 goto onError;
7603 }
7604 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007605 if (width == -1 && PyErr_Occurred())
7606 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 if (width < 0) {
7608 flags |= F_LJUST;
7609 width = -width;
7610 }
7611 if (--fmtcnt >= 0)
7612 c = *fmt++;
7613 }
7614 else if (c >= '0' && c <= '9') {
7615 width = c - '0';
7616 while (--fmtcnt >= 0) {
7617 c = *fmt++;
7618 if (c < '0' || c > '9')
7619 break;
7620 if ((width*10) / 10 != width) {
7621 PyErr_SetString(PyExc_ValueError,
7622 "width too big");
7623 goto onError;
7624 }
7625 width = width*10 + (c - '0');
7626 }
7627 }
7628 if (c == '.') {
7629 prec = 0;
7630 if (--fmtcnt >= 0)
7631 c = *fmt++;
7632 if (c == '*') {
7633 v = getnextarg(args, arglen, &argidx);
7634 if (v == NULL)
7635 goto onError;
7636 if (!PyInt_Check(v)) {
7637 PyErr_SetString(PyExc_TypeError,
7638 "* wants int");
7639 goto onError;
7640 }
7641 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007642 if (prec == -1 && PyErr_Occurred())
7643 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 if (prec < 0)
7645 prec = 0;
7646 if (--fmtcnt >= 0)
7647 c = *fmt++;
7648 }
7649 else if (c >= '0' && c <= '9') {
7650 prec = c - '0';
7651 while (--fmtcnt >= 0) {
7652 c = Py_CHARMASK(*fmt++);
7653 if (c < '0' || c > '9')
7654 break;
7655 if ((prec*10) / 10 != prec) {
7656 PyErr_SetString(PyExc_ValueError,
7657 "prec too big");
7658 goto onError;
7659 }
7660 prec = prec*10 + (c - '0');
7661 }
7662 }
7663 } /* prec */
7664 if (fmtcnt >= 0) {
7665 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 if (--fmtcnt >= 0)
7667 c = *fmt++;
7668 }
7669 }
7670 if (fmtcnt < 0) {
7671 PyErr_SetString(PyExc_ValueError,
7672 "incomplete format");
7673 goto onError;
7674 }
7675 if (c != '%') {
7676 v = getnextarg(args, arglen, &argidx);
7677 if (v == NULL)
7678 goto onError;
7679 }
7680 sign = 0;
7681 fill = ' ';
7682 switch (c) {
7683
7684 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007685 pbuf = formatbuf;
7686 /* presume that buffer length is at least 1 */
7687 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 len = 1;
7689 break;
7690
7691 case 's':
7692 case 'r':
7693 if (PyUnicode_Check(v) && c == 's') {
7694 temp = v;
7695 Py_INCREF(temp);
7696 }
7697 else {
7698 PyObject *unicode;
7699 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007700 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 else
7702 temp = PyObject_Repr(v);
7703 if (temp == NULL)
7704 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007705 if (PyUnicode_Check(temp))
7706 /* nothing to do */;
7707 else if (PyString_Check(temp)) {
7708 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007709 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007711 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007713 Py_DECREF(temp);
7714 temp = unicode;
7715 if (temp == NULL)
7716 goto onError;
7717 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007718 else {
7719 Py_DECREF(temp);
7720 PyErr_SetString(PyExc_TypeError,
7721 "%s argument has non-string str()");
7722 goto onError;
7723 }
7724 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007725 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 len = PyUnicode_GET_SIZE(temp);
7727 if (prec >= 0 && len > prec)
7728 len = prec;
7729 break;
7730
7731 case 'i':
7732 case 'd':
7733 case 'u':
7734 case 'o':
7735 case 'x':
7736 case 'X':
7737 if (c == 'i')
7738 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007739 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007740 temp = formatlong(v, flags, prec, c);
7741 if (!temp)
7742 goto onError;
7743 pbuf = PyUnicode_AS_UNICODE(temp);
7744 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007745 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007747 else {
7748 pbuf = formatbuf;
7749 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7750 flags, prec, c, v);
7751 if (len < 0)
7752 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007753 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007754 }
7755 if (flags & F_ZERO)
7756 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 break;
7758
7759 case 'e':
7760 case 'E':
7761 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007762 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 case 'g':
7764 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007765 if (c == 'F')
7766 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007767 pbuf = formatbuf;
7768 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7769 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 if (len < 0)
7771 goto onError;
7772 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007773 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 fill = '0';
7775 break;
7776
7777 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007778 pbuf = formatbuf;
7779 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 if (len < 0)
7781 goto onError;
7782 break;
7783
7784 default:
7785 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007786 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007787 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007788 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007789 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007790 (Py_ssize_t)(fmt - 1 -
7791 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 goto onError;
7793 }
7794 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007795 if (*pbuf == '-' || *pbuf == '+') {
7796 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 len--;
7798 }
7799 else if (flags & F_SIGN)
7800 sign = '+';
7801 else if (flags & F_BLANK)
7802 sign = ' ';
7803 else
7804 sign = 0;
7805 }
7806 if (width < len)
7807 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007808 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 reslen -= rescnt;
7810 rescnt = width + fmtcnt + 100;
7811 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007812 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007813 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007814 PyErr_NoMemory();
7815 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007816 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007817 if (_PyUnicode_Resize(&result, reslen) < 0) {
7818 Py_XDECREF(temp);
7819 goto onError;
7820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 res = PyUnicode_AS_UNICODE(result)
7822 + reslen - rescnt;
7823 }
7824 if (sign) {
7825 if (fill != ' ')
7826 *res++ = sign;
7827 rescnt--;
7828 if (width > len)
7829 width--;
7830 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007831 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7832 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007833 assert(pbuf[1] == c);
7834 if (fill != ' ') {
7835 *res++ = *pbuf++;
7836 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007837 }
Tim Petersfff53252001-04-12 18:38:48 +00007838 rescnt -= 2;
7839 width -= 2;
7840 if (width < 0)
7841 width = 0;
7842 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 if (width > len && !(flags & F_LJUST)) {
7845 do {
7846 --rescnt;
7847 *res++ = fill;
7848 } while (--width > len);
7849 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007850 if (fill == ' ') {
7851 if (sign)
7852 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007853 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007854 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007855 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007856 *res++ = *pbuf++;
7857 *res++ = *pbuf++;
7858 }
7859 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007860 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 res += len;
7862 rescnt -= len;
7863 while (--width >= len) {
7864 --rescnt;
7865 *res++ = ' ';
7866 }
7867 if (dict && (argidx < arglen) && c != '%') {
7868 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007869 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007870 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 goto onError;
7872 }
7873 Py_XDECREF(temp);
7874 } /* '%' */
7875 } /* until end */
7876 if (argidx < arglen && !dict) {
7877 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007878 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 goto onError;
7880 }
7881
Thomas Woutersa96affe2006-03-12 00:29:36 +00007882 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7883 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 if (args_owned) {
7885 Py_DECREF(args);
7886 }
7887 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 return (PyObject *)result;
7889
7890 onError:
7891 Py_XDECREF(result);
7892 Py_DECREF(uformat);
7893 if (args_owned) {
7894 Py_DECREF(args);
7895 }
7896 return NULL;
7897}
7898
7899static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007900 (readbufferproc) unicode_buffer_getreadbuf,
7901 (writebufferproc) unicode_buffer_getwritebuf,
7902 (segcountproc) unicode_buffer_getsegcount,
7903 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904};
7905
Jeremy Hylton938ace62002-07-17 16:30:39 +00007906static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007907unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7908
Tim Peters6d6c1a32001-08-02 04:15:00 +00007909static PyObject *
7910unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7911{
7912 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007913 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007914 char *encoding = NULL;
7915 char *errors = NULL;
7916
Guido van Rossume023fe02001-08-30 03:12:59 +00007917 if (type != &PyUnicode_Type)
7918 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007919 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7920 kwlist, &x, &encoding, &errors))
7921 return NULL;
7922 if (x == NULL)
7923 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007924 if (encoding == NULL && errors == NULL)
7925 return PyObject_Unicode(x);
7926 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007927 return PyUnicode_FromEncodedObject(x, encoding, errors);
7928}
7929
Guido van Rossume023fe02001-08-30 03:12:59 +00007930static PyObject *
7931unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7932{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007933 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007934 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007935
7936 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7937 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7938 if (tmp == NULL)
7939 return NULL;
7940 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007941 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007942 if (pnew == NULL) {
7943 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007944 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007945 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007946 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7947 if (pnew->str == NULL) {
7948 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007949 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007950 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007951 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007952 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007953 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7954 pnew->length = n;
7955 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007956 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007957 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007958}
7959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007960PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007961"unicode(string [, encoding[, errors]]) -> object\n\
7962\n\
7963Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007964encoding defaults to the current default string encoding.\n\
7965errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007966
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007967static PyObject *unicode_iter(PyObject *seq);
7968
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969PyTypeObject PyUnicode_Type = {
7970 PyObject_HEAD_INIT(&PyType_Type)
7971 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00007972 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 sizeof(PyUnicodeObject), /* tp_size */
7974 0, /* tp_itemsize */
7975 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007976 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007978 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007980 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007981 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007982 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007984 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 (hashfunc) unicode_hash, /* tp_hash*/
7986 0, /* tp_call*/
7987 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007988 PyObject_GenericGetAttr, /* tp_getattro */
7989 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00007991 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
7992 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007993 unicode_doc, /* tp_doc */
7994 0, /* tp_traverse */
7995 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007996 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007997 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007998 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007999 0, /* tp_iternext */
8000 unicode_methods, /* tp_methods */
8001 0, /* tp_members */
8002 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008003 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008004 0, /* tp_dict */
8005 0, /* tp_descr_get */
8006 0, /* tp_descr_set */
8007 0, /* tp_dictoffset */
8008 0, /* tp_init */
8009 0, /* tp_alloc */
8010 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008011 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012};
8013
8014/* Initialize the Unicode implementation */
8015
Thomas Wouters78890102000-07-22 19:25:51 +00008016void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008018 int i;
8019
Thomas Wouters477c8d52006-05-27 19:21:47 +00008020 /* XXX - move this array to unicodectype.c ? */
8021 Py_UNICODE linebreak[] = {
8022 0x000A, /* LINE FEED */
8023 0x000D, /* CARRIAGE RETURN */
8024 0x001C, /* FILE SEPARATOR */
8025 0x001D, /* GROUP SEPARATOR */
8026 0x001E, /* RECORD SEPARATOR */
8027 0x0085, /* NEXT LINE */
8028 0x2028, /* LINE SEPARATOR */
8029 0x2029, /* PARAGRAPH SEPARATOR */
8030 };
8031
Fred Drakee4315f52000-05-09 19:53:39 +00008032 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008033 unicode_freelist = NULL;
8034 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008036 if (!unicode_empty)
8037 return;
8038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008039 for (i = 0; i < 256; i++)
8040 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008041 if (PyType_Ready(&PyUnicode_Type) < 0)
8042 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043
8044 /* initialize the linebreak bloom filter */
8045 bloom_linebreak = make_bloom_mask(
8046 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8047 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008048
8049 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050}
8051
8052/* Finalize the Unicode implementation */
8053
8054void
Thomas Wouters78890102000-07-22 19:25:51 +00008055_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008057 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008058 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008060 Py_XDECREF(unicode_empty);
8061 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008062
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008063 for (i = 0; i < 256; i++) {
8064 if (unicode_latin1[i]) {
8065 Py_DECREF(unicode_latin1[i]);
8066 unicode_latin1[i] = NULL;
8067 }
8068 }
8069
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008070 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 PyUnicodeObject *v = u;
8072 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008073 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008074 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008075 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008076 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008078 unicode_freelist = NULL;
8079 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008081
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008082
8083
8084/********************* Unicode Iterator **************************/
8085
8086typedef struct {
8087 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008088 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008089 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8090} unicodeiterobject;
8091
8092static void
8093unicodeiter_dealloc(unicodeiterobject *it)
8094{
8095 _PyObject_GC_UNTRACK(it);
8096 Py_XDECREF(it->it_seq);
8097 PyObject_GC_Del(it);
8098}
8099
8100static int
8101unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8102{
8103 Py_VISIT(it->it_seq);
8104 return 0;
8105}
8106
8107static PyObject *
8108unicodeiter_next(unicodeiterobject *it)
8109{
8110 PyUnicodeObject *seq;
8111 PyObject *item;
8112
8113 assert(it != NULL);
8114 seq = it->it_seq;
8115 if (seq == NULL)
8116 return NULL;
8117 assert(PyUnicode_Check(seq));
8118
8119 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008120 item = PyUnicode_FromUnicode(
8121 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008122 if (item != NULL)
8123 ++it->it_index;
8124 return item;
8125 }
8126
8127 Py_DECREF(seq);
8128 it->it_seq = NULL;
8129 return NULL;
8130}
8131
8132static PyObject *
8133unicodeiter_len(unicodeiterobject *it)
8134{
8135 Py_ssize_t len = 0;
8136 if (it->it_seq)
8137 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8138 return PyInt_FromSsize_t(len);
8139}
8140
8141PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8142
8143static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008144 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8145 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008146 {NULL, NULL} /* sentinel */
8147};
8148
8149PyTypeObject PyUnicodeIter_Type = {
8150 PyObject_HEAD_INIT(&PyType_Type)
8151 0, /* ob_size */
8152 "unicodeiterator", /* tp_name */
8153 sizeof(unicodeiterobject), /* tp_basicsize */
8154 0, /* tp_itemsize */
8155 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008156 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008157 0, /* tp_print */
8158 0, /* tp_getattr */
8159 0, /* tp_setattr */
8160 0, /* tp_compare */
8161 0, /* tp_repr */
8162 0, /* tp_as_number */
8163 0, /* tp_as_sequence */
8164 0, /* tp_as_mapping */
8165 0, /* tp_hash */
8166 0, /* tp_call */
8167 0, /* tp_str */
8168 PyObject_GenericGetAttr, /* tp_getattro */
8169 0, /* tp_setattro */
8170 0, /* tp_as_buffer */
8171 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8172 0, /* tp_doc */
8173 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8174 0, /* tp_clear */
8175 0, /* tp_richcompare */
8176 0, /* tp_weaklistoffset */
8177 PyObject_SelfIter, /* tp_iter */
8178 (iternextfunc)unicodeiter_next, /* tp_iternext */
8179 unicodeiter_methods, /* tp_methods */
8180 0,
8181};
8182
8183static PyObject *
8184unicode_iter(PyObject *seq)
8185{
8186 unicodeiterobject *it;
8187
8188 if (!PyUnicode_Check(seq)) {
8189 PyErr_BadInternalCall();
8190 return NULL;
8191 }
8192 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8193 if (it == NULL)
8194 return NULL;
8195 it->it_index = 0;
8196 Py_INCREF(seq);
8197 it->it_seq = (PyUnicodeObject *)seq;
8198 _PyObject_GC_TRACK(it);
8199 return (PyObject *)it;
8200}
8201
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008202#ifdef __cplusplus
8203}
8204#endif
8205
8206
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008207/*
8208Local variables:
8209c-basic-offset: 4
8210indent-tabs-mode: nil
8211End:
8212*/