blob: 999b1661eb26c4ea9777c9ab5fab6b4c65c54402 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
125/* stuff to implement simple "bloom filters" for Unicode characters.
126 to keep things simple, we use a single bitmask, using the least 5
127 bits from each unicode characters as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
/* Type of a bloom bitmask: one bit per possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters; the comment above says it
   is set up by Unicode_Init. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a signed int left
   by 31 is undefined behavior, and the operand should have the same
   width as BLOOM_MASK.  Both arguments are parenthesized so that
   arbitrary expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check first, exact line break test second. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Non-zero if chr is one of the setlen characters in set.  The bloom
   mask is consulted first so that unicode_member() is only called when
   the character might be present.  The whole expansion is parenthesized
   so the macro can be negated or combined with other operators without
   surprising precedence; note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
351#define _PyUnicode_Resize(unicodevar, length) \
352 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldd2034312007-05-18 16:29:38 +0000396PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000397{
398 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000399 /* If the Unicode data is known at construction time, we can apply
400 some optimizations which share commonly used objects. */
401 if (u != NULL) {
402
403 /* Optimization for empty strings */
404 if (size == 0 && unicode_empty != NULL) {
405 Py_INCREF(unicode_empty);
406 return (PyObject *)unicode_empty;
407 }
408
Walter Dörwald071b9da2007-05-05 14:21:20 +0000409 /* Single characters are shared when using this constructor */
410 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000411 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000412 if (!unicode) {
413 unicode = _PyUnicode_New(1);
414 if (!unicode)
415 return NULL;
416 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 }
419 Py_INCREF(unicode);
420 return (PyObject *)unicode;
421 }
422 }
423
Walter Dörwald55507312007-05-18 13:12:10 +0000424 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000425 if (!unicode)
426 return NULL;
427
428 /* Copy the Unicode data into the new object */
429 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000430 Py_UNICODE *p = unicode->str;
431 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 ;
433 }
434
435 return (PyObject *)unicode;
436}
437
Walter Dörwaldd2034312007-05-18 16:29:38 +0000438PyObject *PyUnicode_FromString(const char *u)
439{
440 size_t size = strlen(u);
441 if (size > PY_SSIZE_T_MAX) {
442 PyErr_SetString(PyExc_OverflowError, "input too long");
443 return NULL;
444 }
445
446 return PyUnicode_FromStringAndSize(u, size);
447}
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_WCHAR_H
450
451PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
456 if (w == NULL) {
457 PyErr_BadInternalCall();
458 return NULL;
459 }
460
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the wchar_t data into the new object */
466#ifdef HAVE_USABLE_WCHAR_T
467 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000468#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 {
470 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000471 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000473 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 *u++ = *w++;
475 }
476#endif
477
478 return (PyObject *)unicode;
479}
480
Walter Dörwaldd2034312007-05-18 16:29:38 +0000481#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
482
483PyObject *
484PyUnicode_FromFormatV(const char *format, va_list vargs)
485{
486 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000487 Py_ssize_t callcount = 0;
488 PyObject **callresults = NULL;
489 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000490 Py_ssize_t n = 0;
491 const char* f;
492 Py_UNICODE *s;
493 PyObject *string;
494 /* used by sprintf */
495 char buffer[21];
496 const char *copy;
497
498#ifdef VA_LIST_IS_ARRAY
499 Py_MEMCPY(count, vargs, sizeof(va_list));
500#else
501#ifdef __va_copy
502 __va_copy(count, vargs);
503#else
504 count = vargs;
505#endif
506#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000507 /* step 1: count the number of %S/%R format specifications
508 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
509 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000510 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000511 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000512 ++callcount;
513 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000514 /* step 2: allocate memory for the results of
515 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000516 if (callcount) {
517 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
518 if (!callresults) {
519 PyErr_NoMemory();
520 return NULL;
521 }
522 callresult = callresults;
523 }
524 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525 for (f = format; *f; f++) {
526 if (*f == '%') {
527 const char* p = f;
528 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
529 ;
530
531 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
532 * they don't affect the amount of space we reserve.
533 */
534 if ((*f == 'l' || *f == 'z') &&
535 (f[1] == 'd' || f[1] == 'u'))
536 ++f;
537
538 switch (*f) {
539 case 'c':
540 (void)va_arg(count, int);
541 /* fall through... */
542 case '%':
543 n++;
544 break;
545 case 'd': case 'u': case 'i': case 'x':
546 (void) va_arg(count, int);
547 /* 20 bytes is enough to hold a 64-bit
548 integer. Decimal takes the most space.
549 This isn't enough for octal. */
550 n += 20;
551 break;
552 case 's':
553 n += strlen(va_arg(count, char*));
554 break;
555 case 'U':
556 {
557 PyObject *obj = va_arg(count, PyObject *);
558 assert(obj && PyUnicode_Check(obj));
559 n += PyUnicode_GET_SIZE(obj);
560 break;
561 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000562 case 'S':
563 {
564 PyObject *obj = va_arg(count, PyObject *);
565 PyObject *str;
566 assert(obj);
567 str = PyObject_Unicode(obj);
568 if (!str)
569 goto fail;
570 n += PyUnicode_GET_SIZE(str);
571 /* Remember the str and switch to the next slot */
572 *callresult++ = str;
573 break;
574 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 case 'R':
576 {
577 PyObject *obj = va_arg(count, PyObject *);
578 PyObject *repr;
579 assert(obj);
580 repr = PyObject_Repr(obj);
581 if (!repr)
582 goto fail;
583 n += PyUnicode_GET_SIZE(repr);
584 /* Remember the repr and switch to the next slot */
585 *callresult++ = repr;
586 break;
587 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 case 'p':
589 (void) va_arg(count, int);
590 /* maximum 64-bit pointer representation:
591 * 0xffffffffffffffff
592 * so 19 characters is enough.
593 * XXX I count 18 -- what's the extra for?
594 */
595 n += 19;
596 break;
597 default:
598 /* if we stumble upon an unknown
599 formatting code, copy the rest of
600 the format string to the output
601 string. (we cannot just skip the
602 code, since there's no way to know
603 what's in the argument list) */
604 n += strlen(p);
605 goto expand;
606 }
607 } else
608 n++;
609 }
610 expand:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000611 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000612 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000613 we don't have to resize the string.
614 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000615 string = PyUnicode_FromUnicode(NULL, n);
616 if (!string)
617 return NULL;
618
619 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000620 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000621
622 for (f = format; *f; f++) {
623 if (*f == '%') {
624 const char* p = f++;
625 int longflag = 0;
626 int size_tflag = 0;
627 /* parse the width.precision part (we're only
628 interested in the precision value, if any) */
629 n = 0;
630 while (isdigit(Py_CHARMASK(*f)))
631 n = (n*10) + *f++ - '0';
632 if (*f == '.') {
633 f++;
634 n = 0;
635 while (isdigit(Py_CHARMASK(*f)))
636 n = (n*10) + *f++ - '0';
637 }
638 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
639 f++;
640 /* handle the long flag, but only for %ld and %lu.
641 others can be added when necessary. */
642 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
643 longflag = 1;
644 ++f;
645 }
646 /* handle the size_t flag. */
647 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
648 size_tflag = 1;
649 ++f;
650 }
651
652 switch (*f) {
653 case 'c':
654 *s++ = va_arg(vargs, int);
655 break;
656 case 'd':
657 if (longflag)
658 sprintf(buffer, "%ld", va_arg(vargs, long));
659 else if (size_tflag)
660 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
661 va_arg(vargs, Py_ssize_t));
662 else
663 sprintf(buffer, "%d", va_arg(vargs, int));
664 appendstring(buffer);
665 break;
666 case 'u':
667 if (longflag)
668 sprintf(buffer, "%lu",
669 va_arg(vargs, unsigned long));
670 else if (size_tflag)
671 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
672 va_arg(vargs, size_t));
673 else
674 sprintf(buffer, "%u",
675 va_arg(vargs, unsigned int));
676 appendstring(buffer);
677 break;
678 case 'i':
679 sprintf(buffer, "%i", va_arg(vargs, int));
680 appendstring(buffer);
681 break;
682 case 'x':
683 sprintf(buffer, "%x", va_arg(vargs, int));
684 appendstring(buffer);
685 break;
686 case 's':
687 p = va_arg(vargs, char*);
688 appendstring(p);
689 break;
690 case 'U':
691 {
692 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000693 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
694 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
695 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000696 break;
697 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000698 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000699 case 'R':
700 {
701 /* unused, since we already have the result */
702 (void) va_arg(vargs, PyObject *);
703 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
704 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
705 Py_ssize_t upos;
706 for (upos = 0; upos<usize;)
707 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000708 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000709 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000710 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 ++callresult;
712 break;
713 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714 case 'p':
715 sprintf(buffer, "%p", va_arg(vargs, void*));
716 /* %p is ill-defined: ensure leading 0x. */
717 if (buffer[1] == 'X')
718 buffer[1] = 'x';
719 else if (buffer[1] != 'x') {
720 memmove(buffer+2, buffer, strlen(buffer)+1);
721 buffer[0] = '0';
722 buffer[1] = 'x';
723 }
724 appendstring(buffer);
725 break;
726 case '%':
727 *s++ = '%';
728 break;
729 default:
730 appendstring(p);
731 goto end;
732 }
733 } else
734 *s++ = *f;
735 }
736
737 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000738 if (callresults)
739 PyMem_Free(callresults);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
741 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 fail:
743 if (callresults) {
744 PyObject **callresult2 = callresults;
745 while (callresult2 <= callresult) {
746 Py_DECREF(*callresult2);
747 ++callresult2;
748 }
749 PyMem_Free(callresults);
750 }
751 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752}
753
754#undef appendstring
755
756PyObject *
757PyUnicode_FromFormat(const char *format, ...)
758{
759 PyObject* ret;
760 va_list vargs;
761
762#ifdef HAVE_STDARG_PROTOTYPES
763 va_start(vargs, format);
764#else
765 va_start(vargs);
766#endif
767 ret = PyUnicode_FromFormatV(format, vargs);
768 va_end(vargs);
769 return ret;
770}
771
Martin v. Löwis18e16552006-02-15 17:27:45 +0000772Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
773 wchar_t *w,
774 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775{
776 if (unicode == NULL) {
777 PyErr_BadInternalCall();
778 return -1;
779 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000780
781 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000783 size = PyUnicode_GET_SIZE(unicode) + 1;
784
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785#ifdef HAVE_USABLE_WCHAR_T
786 memcpy(w, unicode->str, size * sizeof(wchar_t));
787#else
788 {
789 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000790 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000792 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 *w++ = *u++;
794 }
795#endif
796
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000797 if (size > PyUnicode_GET_SIZE(unicode))
798 return PyUnicode_GET_SIZE(unicode);
799 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000800 return size;
801}
802
803#endif
804
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000805PyObject *PyUnicode_FromOrdinal(int ordinal)
806{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000807 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000808
809#ifdef Py_UNICODE_WIDE
810 if (ordinal < 0 || ordinal > 0x10ffff) {
811 PyErr_SetString(PyExc_ValueError,
812 "unichr() arg not in range(0x110000) "
813 "(wide Python build)");
814 return NULL;
815 }
816#else
817 if (ordinal < 0 || ordinal > 0xffff) {
818 PyErr_SetString(PyExc_ValueError,
819 "unichr() arg not in range(0x10000) "
820 "(narrow Python build)");
821 return NULL;
822 }
823#endif
824
Hye-Shik Chang40574832004-04-06 07:24:51 +0000825 s[0] = (Py_UNICODE)ordinal;
826 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000827}
828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829PyObject *PyUnicode_FromObject(register PyObject *obj)
830{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000831 /* XXX Perhaps we should make this API an alias of
832 PyObject_Unicode() instead ?! */
833 if (PyUnicode_CheckExact(obj)) {
834 Py_INCREF(obj);
835 return obj;
836 }
837 if (PyUnicode_Check(obj)) {
838 /* For a Unicode subtype that's not a Unicode object,
839 return a true Unicode object with the same data. */
840 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
841 PyUnicode_GET_SIZE(obj));
842 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000843 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
844}
845
846PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
847 const char *encoding,
848 const char *errors)
849{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000850 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000851 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000852 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000853
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 if (obj == NULL) {
855 PyErr_BadInternalCall();
856 return NULL;
857 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000859#if 0
860 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000861 that no encodings is given and then redirect to
862 PyObject_Unicode() which then applies the additional logic for
863 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000864
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000865 NOTE: This API should really only be used for object which
866 represent *encoded* Unicode !
867
868 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000869 if (PyUnicode_Check(obj)) {
870 if (encoding) {
871 PyErr_SetString(PyExc_TypeError,
872 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000873 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000874 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000875 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000876 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000877#else
878 if (PyUnicode_Check(obj)) {
879 PyErr_SetString(PyExc_TypeError,
880 "decoding Unicode is not supported");
881 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000882 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000883#endif
884
885 /* Coerce object */
886 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000887 s = PyString_AS_STRING(obj);
888 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000889 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000890 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
891 /* Overwrite the error message with something more useful in
892 case of a TypeError. */
893 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000894 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000895 "coercing to Unicode: need string or buffer, "
896 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000897 obj->ob_type->tp_name);
898 goto onError;
899 }
Tim Petersced69f82003-09-16 20:30:58 +0000900
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000901 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 if (len == 0) {
903 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000904 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905 }
Tim Petersced69f82003-09-16 20:30:58 +0000906 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000907 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000908
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000909 return v;
910
911 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000913}
914
915PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000916 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000917 const char *encoding,
918 const char *errors)
919{
920 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921
922 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000923 encoding = PyUnicode_GetDefaultEncoding();
924
925 /* Shortcuts for common default encodings */
926 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000928 else if (strcmp(encoding, "latin-1") == 0)
929 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000930#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
931 else if (strcmp(encoding, "mbcs") == 0)
932 return PyUnicode_DecodeMBCS(s, size, errors);
933#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000934 else if (strcmp(encoding, "ascii") == 0)
935 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936
937 /* Decode via the codec registry */
938 buffer = PyBuffer_FromMemory((void *)s, size);
939 if (buffer == NULL)
940 goto onError;
941 unicode = PyCodec_Decode(buffer, encoding, errors);
942 if (unicode == NULL)
943 goto onError;
944 if (!PyUnicode_Check(unicode)) {
945 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000946 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 unicode->ob_type->tp_name);
948 Py_DECREF(unicode);
949 goto onError;
950 }
951 Py_DECREF(buffer);
952 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000953
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954 onError:
955 Py_XDECREF(buffer);
956 return NULL;
957}
958
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000959PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
960 const char *encoding,
961 const char *errors)
962{
963 PyObject *v;
964
965 if (!PyUnicode_Check(unicode)) {
966 PyErr_BadArgument();
967 goto onError;
968 }
969
970 if (encoding == NULL)
971 encoding = PyUnicode_GetDefaultEncoding();
972
973 /* Decode via the codec registry */
974 v = PyCodec_Decode(unicode, encoding, errors);
975 if (v == NULL)
976 goto onError;
977 return v;
978
979 onError:
980 return NULL;
981}
982
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000984 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985 const char *encoding,
986 const char *errors)
987{
988 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000989
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990 unicode = PyUnicode_FromUnicode(s, size);
991 if (unicode == NULL)
992 return NULL;
993 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
994 Py_DECREF(unicode);
995 return v;
996}
997
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000998PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
999 const char *encoding,
1000 const char *errors)
1001{
1002 PyObject *v;
1003
1004 if (!PyUnicode_Check(unicode)) {
1005 PyErr_BadArgument();
1006 goto onError;
1007 }
1008
1009 if (encoding == NULL)
1010 encoding = PyUnicode_GetDefaultEncoding();
1011
1012 /* Encode via the codec registry */
1013 v = PyCodec_Encode(unicode, encoding, errors);
1014 if (v == NULL)
1015 goto onError;
1016 return v;
1017
1018 onError:
1019 return NULL;
1020}
1021
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1023 const char *encoding,
1024 const char *errors)
1025{
1026 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001027
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 if (!PyUnicode_Check(unicode)) {
1029 PyErr_BadArgument();
1030 goto onError;
1031 }
Fred Drakee4315f52000-05-09 19:53:39 +00001032
Tim Petersced69f82003-09-16 20:30:58 +00001033 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001034 encoding = PyUnicode_GetDefaultEncoding();
1035
1036 /* Shortcuts for common default encodings */
1037 if (errors == NULL) {
1038 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001039 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001040 else if (strcmp(encoding, "latin-1") == 0)
1041 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001042#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1043 else if (strcmp(encoding, "mbcs") == 0)
1044 return PyUnicode_AsMBCSString(unicode);
1045#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001046 else if (strcmp(encoding, "ascii") == 0)
1047 return PyUnicode_AsASCIIString(unicode);
1048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049
1050 /* Encode via the codec registry */
1051 v = PyCodec_Encode(unicode, encoding, errors);
1052 if (v == NULL)
1053 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001054 if (!PyBytes_Check(v)) {
1055 if (PyString_Check(v)) {
1056 /* Old codec, turn it into bytes */
1057 PyObject *b = PyBytes_FromObject(v);
1058 Py_DECREF(v);
1059 return b;
1060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001062 "encoder did not return a bytes object "
1063 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1064 v->ob_type->tp_name,
1065 encoding ? encoding : "NULL",
1066 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 Py_DECREF(v);
1068 goto onError;
1069 }
1070 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 onError:
1073 return NULL;
1074}
1075
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001076PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1077 const char *errors)
1078{
1079 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001080 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001081 if (v)
1082 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001083 if (errors != NULL)
1084 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1085 if (errors == NULL) {
1086 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1087 PyUnicode_GET_SIZE(unicode),
1088 NULL);
1089 }
1090 else {
1091 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1092 }
1093 if (!b)
1094 return NULL;
1095 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1096 PyBytes_Size(b));
1097 Py_DECREF(b);
1098 if (!errors) {
1099 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001100 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001101 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001102 return v;
1103}
1104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1106{
1107 if (!PyUnicode_Check(unicode)) {
1108 PyErr_BadArgument();
1109 goto onError;
1110 }
1111 return PyUnicode_AS_UNICODE(unicode);
1112
1113 onError:
1114 return NULL;
1115}
1116
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118{
1119 if (!PyUnicode_Check(unicode)) {
1120 PyErr_BadArgument();
1121 goto onError;
1122 }
1123 return PyUnicode_GET_SIZE(unicode);
1124
1125 onError:
1126 return -1;
1127}
1128
Thomas Wouters78890102000-07-22 19:25:51 +00001129const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001130{
1131 return unicode_default_encoding;
1132}
1133
1134int PyUnicode_SetDefaultEncoding(const char *encoding)
1135{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001136 if (strcmp(encoding, unicode_default_encoding) != 0) {
1137 PyErr_Format(PyExc_ValueError,
1138 "Can only set default encoding to %s",
1139 unicode_default_encoding);
1140 return -1;
1141 }
Fred Drakee4315f52000-05-09 19:53:39 +00001142 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001143}
1144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001145/* error handling callback helper:
1146 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001147 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001148 and adjust various state variables.
1149 return 0 on success, -1 on error
1150*/
1151
1152static
1153int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1154 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001155 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1156 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001157{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159
1160 PyObject *restuple = NULL;
1161 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001162 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1163 Py_ssize_t requiredsize;
1164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001166 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167 int res = -1;
1168
1169 if (*errorHandler == NULL) {
1170 *errorHandler = PyCodec_LookupError(errors);
1171 if (*errorHandler == NULL)
1172 goto onError;
1173 }
1174
1175 if (*exceptionObject == NULL) {
1176 *exceptionObject = PyUnicodeDecodeError_Create(
1177 encoding, input, insize, *startinpos, *endinpos, reason);
1178 if (*exceptionObject == NULL)
1179 goto onError;
1180 }
1181 else {
1182 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1183 goto onError;
1184 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1185 goto onError;
1186 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1187 goto onError;
1188 }
1189
1190 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1191 if (restuple == NULL)
1192 goto onError;
1193 if (!PyTuple_Check(restuple)) {
1194 PyErr_Format(PyExc_TypeError, &argparse[4]);
1195 goto onError;
1196 }
1197 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1198 goto onError;
1199 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001200 newpos = insize+newpos;
1201 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001202 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001203 goto onError;
1204 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001205
1206 /* need more space? (at least enough for what we
1207 have+the replacement+the rest of the string (starting
1208 at the new input position), so we won't have to check space
1209 when there are no errors in the rest of the string) */
1210 repptr = PyUnicode_AS_UNICODE(repunicode);
1211 repsize = PyUnicode_GET_SIZE(repunicode);
1212 requiredsize = *outpos + repsize + insize-newpos;
1213 if (requiredsize > outsize) {
1214 if (requiredsize<2*outsize)
1215 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001216 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001217 goto onError;
1218 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1219 }
1220 *endinpos = newpos;
1221 *inptr = input + newpos;
1222 Py_UNICODE_COPY(*outptr, repptr, repsize);
1223 *outptr += repsize;
1224 *outpos += repsize;
1225 /* we made it! */
1226 res = 0;
1227
1228 onError:
1229 Py_XDECREF(restuple);
1230 return res;
1231}
1232
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001233/* --- UTF-7 Codec -------------------------------------------------------- */
1234
1235/* see RFC2152 for details */
1236
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1255
/* SPECIAL(c, encodeO, encodeWS): true if character `c` must be base64
   encoded: anything outside 1..127, anything classed 1 in
   utf7_special[], whitespace (class 2) when encodeWS is set and Set O
   characters (class 3) when encodeO is set.

   Note: the comparison (c) <= 0 is a trick to work around gcc warnings
   about the comparison always being false; since utf7_special[0] is 1,
   we can safely make that one comparison true. */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits through `out` for as long
   as the bit accumulator `ch` holds at least six of its `bits` pending
   bits; `out` and `bits` are updated in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): emit 16-bit characters through
   `out` for as long as the bit accumulator `ch` holds at least 16 of
   its `bits` pending bits.

   A code unit in 0xDC00..0xDFFF is treated as part of a surrogate pair,
   which cannot be represented in a 16-bit character: `surrogate` is
   set and an error is raised.  The unit following such an error is
   dropped without further checks, since the error for the pair has
   already been generated.

   The expansion site must provide a `const char *errmsg` variable and
   a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001298
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001299PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001300 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001301 const char *errors)
1302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001304 Py_ssize_t startinpos;
1305 Py_ssize_t endinpos;
1306 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001307 const char *e;
1308 PyUnicodeObject *unicode;
1309 Py_UNICODE *p;
1310 const char *errmsg = "";
1311 int inShift = 0;
1312 unsigned int bitsleft = 0;
1313 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314 int surrogate = 0;
1315 PyObject *errorHandler = NULL;
1316 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001317
1318 unicode = _PyUnicode_New(size);
1319 if (!unicode)
1320 return NULL;
1321 if (size == 0)
1322 return (PyObject *)unicode;
1323
1324 p = unicode->str;
1325 e = s + size;
1326
1327 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001328 Py_UNICODE ch;
1329 restart:
1330 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001331
1332 if (inShift) {
1333 if ((ch == '-') || !B64CHAR(ch)) {
1334 inShift = 0;
1335 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001336
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001337 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1338 if (bitsleft >= 6) {
1339 /* The shift sequence has a partial character in it. If
1340 bitsleft < 6 then we could just classify it as padding
1341 but that is not the case here */
1342
1343 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001344 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001345 }
1346 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001347 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001348 here so indicate the potential of a misencoded character. */
1349
1350 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1351 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1352 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001353 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001354 }
1355
1356 if (ch == '-') {
1357 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001358 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001359 inShift = 1;
1360 }
1361 } else if (SPECIAL(ch,0,0)) {
1362 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001363 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001364 } else {
1365 *p++ = ch;
1366 }
1367 } else {
1368 charsleft = (charsleft << 6) | UB64(ch);
1369 bitsleft += 6;
1370 s++;
1371 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1372 }
1373 }
1374 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001376 s++;
1377 if (s < e && *s == '-') {
1378 s++;
1379 *p++ = '+';
1380 } else
1381 {
1382 inShift = 1;
1383 bitsleft = 0;
1384 }
1385 }
1386 else if (SPECIAL(ch,0,0)) {
1387 errmsg = "unexpected special character";
1388 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001389 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001390 }
1391 else {
1392 *p++ = ch;
1393 s++;
1394 }
1395 continue;
1396 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 outpos = p-PyUnicode_AS_UNICODE(unicode);
1398 endinpos = s-starts;
1399 if (unicode_decode_call_errorhandler(
1400 errors, &errorHandler,
1401 "utf7", errmsg,
1402 starts, size, &startinpos, &endinpos, &exc, &s,
1403 (PyObject **)&unicode, &outpos, &p))
1404 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001405 }
1406
1407 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408 outpos = p-PyUnicode_AS_UNICODE(unicode);
1409 endinpos = size;
1410 if (unicode_decode_call_errorhandler(
1411 errors, &errorHandler,
1412 "utf7", "unterminated shift sequence",
1413 starts, size, &startinpos, &endinpos, &exc, &s,
1414 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 if (s < e)
1417 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418 }
1419
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001420 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001421 goto onError;
1422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423 Py_XDECREF(errorHandler);
1424 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001425 return (PyObject *)unicode;
1426
1427onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001428 Py_XDECREF(errorHandler);
1429 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430 Py_DECREF(unicode);
1431 return NULL;
1432}
1433
1434
1435PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001437 int encodeSetO,
1438 int encodeWhiteSpace,
1439 const char *errors)
1440{
1441 PyObject *v;
1442 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001443 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001444 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001445 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001446 unsigned int bitsleft = 0;
1447 unsigned long charsleft = 0;
1448 char * out;
1449 char * start;
1450
1451 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001452 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001453
Walter Dörwald51ab4142007-05-05 14:43:36 +00001454 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001455 if (v == NULL)
1456 return NULL;
1457
Walter Dörwald51ab4142007-05-05 14:43:36 +00001458 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001459 for (;i < size; ++i) {
1460 Py_UNICODE ch = s[i];
1461
1462 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001463 if (ch == '+') {
1464 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 *out++ = '-';
1466 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1467 charsleft = ch;
1468 bitsleft = 16;
1469 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001470 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001471 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001472 } else {
1473 *out++ = (char) ch;
1474 }
1475 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1477 *out++ = B64(charsleft << (6-bitsleft));
1478 charsleft = 0;
1479 bitsleft = 0;
1480 /* Characters not in the BASE64 set implicitly unshift the sequence
1481 so no '-' is required, except if the character is itself a '-' */
1482 if (B64CHAR(ch) || ch == '-') {
1483 *out++ = '-';
1484 }
1485 inShift = 0;
1486 *out++ = (char) ch;
1487 } else {
1488 bitsleft += 16;
1489 charsleft = (charsleft << 16) | ch;
1490 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1491
1492 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001493 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001494 or '-' then the shift sequence will be terminated implicitly and we
1495 don't have to insert a '-'. */
1496
1497 if (bitsleft == 0) {
1498 if (i + 1 < size) {
1499 Py_UNICODE ch2 = s[i+1];
1500
1501 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001502
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503 } else if (B64CHAR(ch2) || ch2 == '-') {
1504 *out++ = '-';
1505 inShift = 0;
1506 } else {
1507 inShift = 0;
1508 }
1509
1510 }
1511 else {
1512 *out++ = '-';
1513 inShift = 0;
1514 }
1515 }
Tim Petersced69f82003-09-16 20:30:58 +00001516 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 if (bitsleft) {
1520 *out++= B64(charsleft << (6-bitsleft) );
1521 *out++ = '-';
1522 }
1523
Walter Dörwald51ab4142007-05-05 14:43:36 +00001524 if (PyBytes_Resize(v, out - start)) {
1525 Py_DECREF(v);
1526 return NULL;
1527 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528 return v;
1529}
1530
/* The UTF-7 helper macros are not used past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1537
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538/* --- UTF-8 Codec -------------------------------------------------------- */
1539
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001563 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 const char *errors)
1565{
Walter Dörwald69652032004-09-07 20:24:22 +00001566 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1567}
1568
1569PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001570 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001571 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001572 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001574 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001576 Py_ssize_t startinpos;
1577 Py_ssize_t endinpos;
1578 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 const char *e;
1580 PyUnicodeObject *unicode;
1581 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001582 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 PyObject *errorHandler = NULL;
1584 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
1586 /* Note: size will always be longer than the resulting Unicode
1587 character count */
1588 unicode = _PyUnicode_New(size);
1589 if (!unicode)
1590 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001591 if (size == 0) {
1592 if (consumed)
1593 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596
1597 /* Unpack UTF-8 encoded data */
1598 p = unicode->str;
1599 e = s + size;
1600
1601 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001602 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603
1604 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001605 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606 s++;
1607 continue;
1608 }
1609
1610 n = utf8_code_length[ch];
1611
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001612 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 break;
1615 else {
1616 errmsg = "unexpected end of data";
1617 startinpos = s-starts;
1618 endinpos = size;
1619 goto utf8Error;
1620 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
1623 switch (n) {
1624
1625 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 startinpos = s-starts;
1628 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001629 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
1631 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001632 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 startinpos = s-starts;
1634 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636
1637 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001638 if ((s[1] & 0xc0) != 0x80) {
1639 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 startinpos = s-starts;
1641 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001642 goto utf8Error;
1643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001645 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 startinpos = s-starts;
1647 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
1649 goto utf8Error;
1650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001652 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 break;
1654
1655 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001656 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001657 (s[2] & 0xc0) != 0x80) {
1658 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 startinpos = s-starts;
1660 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001661 goto utf8Error;
1662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001664 if (ch < 0x0800) {
1665 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001666 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001667
1668 XXX For wide builds (UCS-4) we should probably try
1669 to recombine the surrogates into a single code
1670 unit.
1671 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001672 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 startinpos = s-starts;
1674 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001675 goto utf8Error;
1676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001678 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001679 break;
1680
1681 case 4:
1682 if ((s[1] & 0xc0) != 0x80 ||
1683 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001684 (s[3] & 0xc0) != 0x80) {
1685 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 startinpos = s-starts;
1687 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001688 goto utf8Error;
1689 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001690 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1691 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1692 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001693 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001694 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001695 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001696 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001698 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 startinpos = s-starts;
1700 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001701 goto utf8Error;
1702 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001703#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001704 *p++ = (Py_UNICODE)ch;
1705#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001706 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001707
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001708 /* translate from 10000..10FFFF to 0..FFFF */
1709 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001710
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001711 /* high surrogate = top 10 bits added to D800 */
1712 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001713
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001714 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001715 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001716#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 break;
1718
1719 default:
1720 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001721 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 startinpos = s-starts;
1723 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001727 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001728
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001729 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 outpos = p-PyUnicode_AS_UNICODE(unicode);
1731 if (unicode_decode_call_errorhandler(
1732 errors, &errorHandler,
1733 "utf8", errmsg,
1734 starts, size, &startinpos, &endinpos, &exc, &s,
1735 (PyObject **)&unicode, &outpos, &p))
1736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 }
Walter Dörwald69652032004-09-07 20:24:22 +00001738 if (consumed)
1739 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740
1741 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001742 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743 goto onError;
1744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745 Py_XDECREF(errorHandler);
1746 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 return (PyObject *)unicode;
1748
1749onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 Py_XDECREF(errorHandler);
1751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 Py_DECREF(unicode);
1753 return NULL;
1754}
1755
Tim Peters602f7402002-04-27 18:03:26 +00001756/* Allocation strategy: if the string is short, convert into a stack buffer
1757 and allocate exactly as much space needed at the end. Else allocate the
1758 maximum possible needed (4 result bytes per Unicode character), and return
1759 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001760*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001761PyObject *
1762PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001763 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001764 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765{
Tim Peters602f7402002-04-27 18:03:26 +00001766#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001767
Martin v. Löwis18e16552006-02-15 17:27:45 +00001768 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001769 PyObject *v; /* result string object */
1770 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001772 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001773 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001774
Tim Peters602f7402002-04-27 18:03:26 +00001775 assert(s != NULL);
1776 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Tim Peters602f7402002-04-27 18:03:26 +00001778 if (size <= MAX_SHORT_UNICHARS) {
1779 /* Write into the stack buffer; nallocated can't overflow.
1780 * At the end, we'll allocate exactly as much heap space as it
1781 * turns out we need.
1782 */
1783 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1784 v = NULL; /* will allocate after we're done */
1785 p = stackbuf;
1786 }
1787 else {
1788 /* Overallocate on the heap, and give the excess back at the end. */
1789 nallocated = size * 4;
1790 if (nallocated / 4 != size) /* overflow! */
1791 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001792 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001793 if (v == NULL)
1794 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001795 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001796 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001797
Tim Peters602f7402002-04-27 18:03:26 +00001798 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001799 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001800
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001801 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001802 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001804
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001806 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001807 *p++ = (char)(0xc0 | (ch >> 6));
1808 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001809 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001810 else {
Tim Peters602f7402002-04-27 18:03:26 +00001811 /* Encode UCS2 Unicode ordinals */
1812 if (ch < 0x10000) {
1813 /* Special case: check for high surrogate */
1814 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1815 Py_UCS4 ch2 = s[i];
1816 /* Check for low surrogate and combine the two to
1817 form a UCS4 value */
1818 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001819 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001820 i++;
1821 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001822 }
Tim Peters602f7402002-04-27 18:03:26 +00001823 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001824 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001826 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1827 *p++ = (char)(0x80 | (ch & 0x3f));
1828 continue;
1829 }
1830encodeUCS4:
1831 /* Encode UCS4 Unicode ordinals */
1832 *p++ = (char)(0xf0 | (ch >> 18));
1833 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1834 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1835 *p++ = (char)(0x80 | (ch & 0x3f));
1836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001838
Tim Peters602f7402002-04-27 18:03:26 +00001839 if (v == NULL) {
1840 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001841 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001842 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001843 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001844 }
1845 else {
1846 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001847 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001848 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001849 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001852
Tim Peters602f7402002-04-27 18:03:26 +00001853#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854}
1855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1857{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 if (!PyUnicode_Check(unicode)) {
1859 PyErr_BadArgument();
1860 return NULL;
1861 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001862 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1863 PyUnicode_GET_SIZE(unicode),
1864 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865}
1866
1867/* --- UTF-16 Codec ------------------------------------------------------- */
1868
Tim Peters772747b2001-08-09 22:21:55 +00001869PyObject *
1870PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001872 const char *errors,
1873 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874{
Walter Dörwald69652032004-09-07 20:24:22 +00001875 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1876}
1877
1878PyObject *
1879PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001881 const char *errors,
1882 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001883 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001886 Py_ssize_t startinpos;
1887 Py_ssize_t endinpos;
1888 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 PyUnicodeObject *unicode;
1890 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001891 const unsigned char *q, *e;
1892 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001894 /* Offsets from q for retrieving byte pairs in the right order. */
1895#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1896 int ihi = 1, ilo = 0;
1897#else
1898 int ihi = 0, ilo = 1;
1899#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 PyObject *errorHandler = NULL;
1901 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902
1903 /* Note: size will always be longer than the resulting Unicode
1904 character count */
1905 unicode = _PyUnicode_New(size);
1906 if (!unicode)
1907 return NULL;
1908 if (size == 0)
1909 return (PyObject *)unicode;
1910
1911 /* Unpack UTF-16 encoded data */
1912 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001913 q = (unsigned char *)s;
1914 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915
1916 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001917 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001919 /* Check for BOM marks (U+FEFF) in the input and adjust current
1920 byte order setting accordingly. In native mode, the leading BOM
1921 mark is skipped, in all other modes, it is copied to the output
1922 stream as-is (giving a ZWNBSP character). */
1923 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001924 if (size >= 2) {
1925 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001926#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001927 if (bom == 0xFEFF) {
1928 q += 2;
1929 bo = -1;
1930 }
1931 else if (bom == 0xFFFE) {
1932 q += 2;
1933 bo = 1;
1934 }
Tim Petersced69f82003-09-16 20:30:58 +00001935#else
Walter Dörwald69652032004-09-07 20:24:22 +00001936 if (bom == 0xFEFF) {
1937 q += 2;
1938 bo = 1;
1939 }
1940 else if (bom == 0xFFFE) {
1941 q += 2;
1942 bo = -1;
1943 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001944#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001945 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947
Tim Peters772747b2001-08-09 22:21:55 +00001948 if (bo == -1) {
1949 /* force LE */
1950 ihi = 1;
1951 ilo = 0;
1952 }
1953 else if (bo == 1) {
1954 /* force BE */
1955 ihi = 0;
1956 ilo = 1;
1957 }
1958
1959 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001960 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001961 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001963 if (consumed)
1964 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 errmsg = "truncated data";
1966 startinpos = ((const char *)q)-starts;
1967 endinpos = ((const char *)e)-starts;
1968 goto utf16Error;
1969 /* The remaining input chars are ignored if the callback
1970 chooses to skip the input */
1971 }
1972 ch = (q[ihi] << 8) | q[ilo];
1973
Tim Peters772747b2001-08-09 22:21:55 +00001974 q += 2;
1975
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 if (ch < 0xD800 || ch > 0xDFFF) {
1977 *p++ = ch;
1978 continue;
1979 }
1980
1981 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 if (q >= e) {
1983 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 startinpos = (((const char *)q)-2)-starts;
1985 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001986 goto utf16Error;
1987 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001988 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001989 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1990 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001991 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001992#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001993 *p++ = ch;
1994 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001995#else
1996 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001997#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001998 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001999 }
2000 else {
2001 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 startinpos = (((const char *)q)-4)-starts;
2003 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 goto utf16Error;
2005 }
2006
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002008 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 startinpos = (((const char *)q)-2)-starts;
2010 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002011 /* Fall through to report the error */
2012
2013 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 outpos = p-PyUnicode_AS_UNICODE(unicode);
2015 if (unicode_decode_call_errorhandler(
2016 errors, &errorHandler,
2017 "utf16", errmsg,
2018 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2019 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002020 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 }
2022
2023 if (byteorder)
2024 *byteorder = bo;
2025
Walter Dörwald69652032004-09-07 20:24:22 +00002026 if (consumed)
2027 *consumed = (const char *)q-starts;
2028
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002030 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 goto onError;
2032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 Py_XDECREF(errorHandler);
2034 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 return (PyObject *)unicode;
2036
2037onError:
2038 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 Py_XDECREF(errorHandler);
2040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 return NULL;
2042}
2043
Tim Peters772747b2001-08-09 22:21:55 +00002044PyObject *
2045PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002046 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002047 const char *errors,
2048 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049{
2050 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002051 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002052#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002053 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002054#else
2055 const int pairs = 0;
2056#endif
Tim Peters772747b2001-08-09 22:21:55 +00002057 /* Offsets from p for storing byte pairs in the right order. */
2058#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2059 int ihi = 1, ilo = 0;
2060#else
2061 int ihi = 0, ilo = 1;
2062#endif
2063
2064#define STORECHAR(CH) \
2065 do { \
2066 p[ihi] = ((CH) >> 8) & 0xff; \
2067 p[ilo] = (CH) & 0xff; \
2068 p += 2; \
2069 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002071#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002072 for (i = pairs = 0; i < size; i++)
2073 if (s[i] >= 0x10000)
2074 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002075#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002076 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002077 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (v == NULL)
2079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080
Walter Dörwald3cc34522007-05-04 10:48:27 +00002081 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002083 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002084 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002085 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002086
2087 if (byteorder == -1) {
2088 /* force LE */
2089 ihi = 1;
2090 ilo = 0;
2091 }
2092 else if (byteorder == 1) {
2093 /* force BE */
2094 ihi = 0;
2095 ilo = 1;
2096 }
2097
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002098 while (size-- > 0) {
2099 Py_UNICODE ch = *s++;
2100 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002103 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2104 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002106#endif
Tim Peters772747b2001-08-09 22:21:55 +00002107 STORECHAR(ch);
2108 if (ch2)
2109 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002112#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113}
2114
2115PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2116{
2117 if (!PyUnicode_Check(unicode)) {
2118 PyErr_BadArgument();
2119 return NULL;
2120 }
2121 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2122 PyUnicode_GET_SIZE(unicode),
2123 NULL,
2124 0);
2125}
2126
2127/* --- Unicode Escape Codec ----------------------------------------------- */
2128
/* Character-name lookup API obtained from the unicodedata module.  It stays
   NULL until PyUnicode_DecodeUnicodeEscape() first handles a \N{...} escape;
   at that point it is loaded from the module's "ucnhash_CAPI" attribute and
   the pointer is kept for later calls. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002129static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002130
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002132 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 const char *errors)
2134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002136 Py_ssize_t startinpos;
2137 Py_ssize_t endinpos;
2138 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002141 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002143 char* message;
2144 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 PyObject *errorHandler = NULL;
2146 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Escaped strings will always be longer than the resulting
2149 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 length after conversion to the true value.
2151 (but if the error callback returns a long replacement string
2152 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 v = _PyUnicode_New(size);
2154 if (v == NULL)
2155 goto onError;
2156 if (size == 0)
2157 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002158
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002161
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 while (s < end) {
2163 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002164 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166
2167 /* Non-escape characters are interpreted as Unicode ordinals */
2168 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002169 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 continue;
2171 }
2172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002173 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 /* \ - Escapes */
2175 s++;
2176 switch (*s++) {
2177
2178 /* \x escapes */
2179 case '\n': break;
2180 case '\\': *p++ = '\\'; break;
2181 case '\'': *p++ = '\''; break;
2182 case '\"': *p++ = '\"'; break;
2183 case 'b': *p++ = '\b'; break;
2184 case 'f': *p++ = '\014'; break; /* FF */
2185 case 't': *p++ = '\t'; break;
2186 case 'n': *p++ = '\n'; break;
2187 case 'r': *p++ = '\r'; break;
2188 case 'v': *p++ = '\013'; break; /* VT */
2189 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2190
2191 /* \OOO (octal) escapes */
2192 case '0': case '1': case '2': case '3':
2193 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002194 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002196 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002198 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002200 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 break;
2202
Fredrik Lundhccc74732001-02-18 22:13:49 +00002203 /* hex escapes */
2204 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002206 digits = 2;
2207 message = "truncated \\xXX escape";
2208 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
Fredrik Lundhccc74732001-02-18 22:13:49 +00002210 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002212 digits = 4;
2213 message = "truncated \\uXXXX escape";
2214 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
Fredrik Lundhccc74732001-02-18 22:13:49 +00002216 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002217 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002218 digits = 8;
2219 message = "truncated \\UXXXXXXXX escape";
2220 hexescape:
2221 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002222 outpos = p-PyUnicode_AS_UNICODE(v);
2223 if (s+digits>end) {
2224 endinpos = size;
2225 if (unicode_decode_call_errorhandler(
2226 errors, &errorHandler,
2227 "unicodeescape", "end of string in escape sequence",
2228 starts, size, &startinpos, &endinpos, &exc, &s,
2229 (PyObject **)&v, &outpos, &p))
2230 goto onError;
2231 goto nextByte;
2232 }
2233 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002234 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002235 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002236 endinpos = (s+i+1)-starts;
2237 if (unicode_decode_call_errorhandler(
2238 errors, &errorHandler,
2239 "unicodeescape", message,
2240 starts, size, &startinpos, &endinpos, &exc, &s,
2241 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002242 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002243 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002244 }
2245 chr = (chr<<4) & ~0xF;
2246 if (c >= '0' && c <= '9')
2247 chr += c - '0';
2248 else if (c >= 'a' && c <= 'f')
2249 chr += 10 + c - 'a';
2250 else
2251 chr += 10 + c - 'A';
2252 }
2253 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002254 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 /* _decoding_error will have already written into the
2256 target buffer. */
2257 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002258 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002259 /* when we get here, chr is a 32-bit unicode character */
2260 if (chr <= 0xffff)
2261 /* UCS-2 character */
2262 *p++ = (Py_UNICODE) chr;
2263 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002264 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002265 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002266#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002267 *p++ = chr;
2268#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002269 chr -= 0x10000L;
2270 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002271 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002272#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002273 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 endinpos = s-starts;
2275 outpos = p-PyUnicode_AS_UNICODE(v);
2276 if (unicode_decode_call_errorhandler(
2277 errors, &errorHandler,
2278 "unicodeescape", "illegal Unicode character",
2279 starts, size, &startinpos, &endinpos, &exc, &s,
2280 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002281 goto onError;
2282 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002283 break;
2284
2285 /* \N{name} */
2286 case 'N':
2287 message = "malformed \\N character escape";
2288 if (ucnhash_CAPI == NULL) {
2289 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002290 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002291 m = PyImport_ImportModule("unicodedata");
2292 if (m == NULL)
2293 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002294 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002295 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002296 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002297 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002298 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002299 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002300 if (ucnhash_CAPI == NULL)
2301 goto ucnhashError;
2302 }
2303 if (*s == '{') {
2304 const char *start = s+1;
2305 /* look for the closing brace */
2306 while (*s != '}' && s < end)
2307 s++;
2308 if (s > start && s < end && *s == '}') {
2309 /* found a name. look it up in the unicode database */
2310 message = "unknown Unicode character name";
2311 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002312 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002313 goto store;
2314 }
2315 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 endinpos = s-starts;
2317 outpos = p-PyUnicode_AS_UNICODE(v);
2318 if (unicode_decode_call_errorhandler(
2319 errors, &errorHandler,
2320 "unicodeescape", message,
2321 starts, size, &startinpos, &endinpos, &exc, &s,
2322 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002323 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002324 break;
2325
2326 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002327 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002328 message = "\\ at end of string";
2329 s--;
2330 endinpos = s-starts;
2331 outpos = p-PyUnicode_AS_UNICODE(v);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "unicodeescape", message,
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002337 goto onError;
2338 }
2339 else {
2340 *p++ = '\\';
2341 *p++ = (unsigned char)s[-1];
2342 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002343 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002345 nextByte:
2346 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002348 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002353
Fredrik Lundhccc74732001-02-18 22:13:49 +00002354ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002355 PyErr_SetString(
2356 PyExc_UnicodeError,
2357 "\\N escapes not supported (can't load unicodedata module)"
2358 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002359 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 Py_XDECREF(errorHandler);
2361 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002362 return NULL;
2363
Fredrik Lundhccc74732001-02-18 22:13:49 +00002364onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002366 Py_XDECREF(errorHandler);
2367 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 return NULL;
2369}
2370
2371/* Return a Unicode-Escape string version of the Unicode object.
2372
2373 If quotes is true, the string is enclosed in u"" or u'' quotes as
2374 appropriate.
2375
2376*/
2377
Thomas Wouters477c8d52006-05-27 19:21:47 +00002378Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2379 Py_ssize_t size,
2380 Py_UNICODE ch)
2381{
2382 /* like wcschr, but doesn't stop at NULL characters */
2383
2384 while (size-- > 0) {
2385 if (*s == ch)
2386 return s;
2387 s++;
2388 }
2389
2390 return NULL;
2391}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002392
/* Lower-case hex digit lookup table; the escape encoders below index it
   with a 4-bit nibble (value & 0xF) to emit \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
2394
2395PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2396 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397{
2398 PyObject *repr;
2399 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
Thomas Wouters89f507f2006-12-13 04:49:30 +00002401 /* XXX(nnorwitz): rather than over-allocating, it would be
2402 better to choose a different scheme. Perhaps scan the
2403 first N-chars of the string and allocate based on that size.
2404 */
2405 /* Initial allocation is based on the longest-possible unichr
2406 escape.
2407
2408 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2409 unichr, so in this case it's the longest unichr escape. In
2410 narrow (UTF-16) builds this is five chars per source unichr
2411 since there are two unichrs in the surrogate pair, so in narrow
2412 (UTF-16) builds it's not the longest unichr escape.
2413
2414 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2415 so in the narrow (UTF-16) build case it's the longest unichr
2416 escape.
2417 */
2418
Walter Dörwald79e913e2007-05-12 11:08:06 +00002419 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002420#ifdef Py_UNICODE_WIDE
2421 + 10*size
2422#else
2423 + 6*size
2424#endif
2425 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 if (repr == NULL)
2427 return NULL;
2428
Walter Dörwald79e913e2007-05-12 11:08:06 +00002429 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 while (size-- > 0) {
2432 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002433
Walter Dörwald79e913e2007-05-12 11:08:06 +00002434 /* Escape backslashes */
2435 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 *p++ = '\\';
2437 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002438 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002439 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002440
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002441#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002442 /* Map 21-bit characters to '\U00xxxxxx' */
2443 else if (ch >= 0x10000) {
2444 *p++ = '\\';
2445 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002446 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2447 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2448 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2449 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2450 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2451 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2452 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2453 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002454 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002455 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002456#else
2457 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002458 else if (ch >= 0xD800 && ch < 0xDC00) {
2459 Py_UNICODE ch2;
2460 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002461
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002462 ch2 = *s++;
2463 size--;
2464 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2465 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2466 *p++ = '\\';
2467 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002468 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2469 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2470 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2471 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2472 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2473 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2474 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2475 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002476 continue;
2477 }
2478 /* Fall through: isolated surrogates are copied as-is */
2479 s--;
2480 size++;
2481 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002482#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002483
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002485 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 *p++ = '\\';
2487 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002488 *p++ = hexdigits[(ch >> 12) & 0x000F];
2489 *p++ = hexdigits[(ch >> 8) & 0x000F];
2490 *p++ = hexdigits[(ch >> 4) & 0x000F];
2491 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002493
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002494 /* Map special whitespace to '\t', \n', '\r' */
2495 else if (ch == '\t') {
2496 *p++ = '\\';
2497 *p++ = 't';
2498 }
2499 else if (ch == '\n') {
2500 *p++ = '\\';
2501 *p++ = 'n';
2502 }
2503 else if (ch == '\r') {
2504 *p++ = '\\';
2505 *p++ = 'r';
2506 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002507
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002508 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002509 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002511 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002512 *p++ = hexdigits[(ch >> 4) & 0x000F];
2513 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002514 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002515
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 /* Copy everything else as-is */
2517 else
2518 *p++ = (char) ch;
2519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520
2521 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002522 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2523 Py_DECREF(repr);
2524 return NULL;
2525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 return repr;
2527}
2528
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2530{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002531 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 if (!PyUnicode_Check(unicode)) {
2533 PyErr_BadArgument();
2534 return NULL;
2535 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002536 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2537 PyUnicode_GET_SIZE(unicode));
2538
2539 if (!s)
2540 return NULL;
2541 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2542 PyBytes_GET_SIZE(s));
2543 Py_DECREF(s);
2544 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545}
2546
2547/* --- Raw Unicode Escape Codec ------------------------------------------- */
2548
2549PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002550 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 const char *errors)
2552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002554 Py_ssize_t startinpos;
2555 Py_ssize_t endinpos;
2556 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 const char *end;
2560 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 PyObject *errorHandler = NULL;
2562 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002563
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 /* Escaped strings will always be longer than the resulting
2565 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 length after conversion to the true value. (But decoding error
2567 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 v = _PyUnicode_New(size);
2569 if (v == NULL)
2570 goto onError;
2571 if (size == 0)
2572 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 end = s + size;
2575 while (s < end) {
2576 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002577 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002579 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580
2581 /* Non-escape characters are interpreted as Unicode ordinals */
2582 if (*s != '\\') {
2583 *p++ = (unsigned char)*s++;
2584 continue;
2585 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002586 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587
2588 /* \u-escapes are only interpreted iff the number of leading
2589 backslashes if odd */
2590 bs = s;
2591 for (;s < end;) {
2592 if (*s != '\\')
2593 break;
2594 *p++ = (unsigned char)*s++;
2595 }
2596 if (((s - bs) & 1) == 0 ||
2597 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002598 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 continue;
2600 }
2601 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002602 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 s++;
2604
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002605 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002607 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 endinpos = s-starts;
2611 if (unicode_decode_call_errorhandler(
2612 errors, &errorHandler,
2613 "rawunicodeescape", "truncated \\uXXXX",
2614 starts, size, &startinpos, &endinpos, &exc, &s,
2615 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 }
2619 x = (x<<4) & ~0xF;
2620 if (c >= '0' && c <= '9')
2621 x += c - '0';
2622 else if (c >= 'a' && c <= 'f')
2623 x += 10 + c - 'a';
2624 else
2625 x += 10 + c - 'A';
2626 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002627#ifndef Py_UNICODE_WIDE
2628 if (x > 0x10000) {
2629 if (unicode_decode_call_errorhandler(
2630 errors, &errorHandler,
2631 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2632 starts, size, &startinpos, &endinpos, &exc, &s,
2633 (PyObject **)&v, &outpos, &p))
2634 goto onError;
2635 }
2636#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 *p++ = x;
2638 nextByte:
2639 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002641 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002642 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 Py_XDECREF(errorHandler);
2644 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 onError:
2648 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 Py_XDECREF(errorHandler);
2650 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 return NULL;
2652}
2653
2654PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002655 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656{
2657 PyObject *repr;
2658 char *p;
2659 char *q;
2660
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002661#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002662 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002663#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002664 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002665#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 if (repr == NULL)
2667 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002668 if (size == 0)
2669 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670
Walter Dörwald711005d2007-05-12 12:03:26 +00002671 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 while (size-- > 0) {
2673 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002674#ifdef Py_UNICODE_WIDE
2675 /* Map 32-bit characters to '\Uxxxxxxxx' */
2676 if (ch >= 0x10000) {
2677 *p++ = '\\';
2678 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002679 *p++ = hexdigits[(ch >> 28) & 0xf];
2680 *p++ = hexdigits[(ch >> 24) & 0xf];
2681 *p++ = hexdigits[(ch >> 20) & 0xf];
2682 *p++ = hexdigits[(ch >> 16) & 0xf];
2683 *p++ = hexdigits[(ch >> 12) & 0xf];
2684 *p++ = hexdigits[(ch >> 8) & 0xf];
2685 *p++ = hexdigits[(ch >> 4) & 0xf];
2686 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002687 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002688 else
2689#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 /* Map 16-bit characters to '\uxxxx' */
2691 if (ch >= 256) {
2692 *p++ = '\\';
2693 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002694 *p++ = hexdigits[(ch >> 12) & 0xf];
2695 *p++ = hexdigits[(ch >> 8) & 0xf];
2696 *p++ = hexdigits[(ch >> 4) & 0xf];
2697 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
2699 /* Copy everything else as-is */
2700 else
2701 *p++ = (char) ch;
2702 }
2703 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002704 if (PyBytes_Resize(repr, p - q)) {
2705 Py_DECREF(repr);
2706 return NULL;
2707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return repr;
2709}
2710
2711PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2712{
Walter Dörwald711005d2007-05-12 12:03:26 +00002713 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002715 PyErr_BadArgument();
2716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002718 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2719 PyUnicode_GET_SIZE(unicode));
2720
2721 if (!s)
2722 return NULL;
2723 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2724 PyBytes_GET_SIZE(s));
2725 Py_DECREF(s);
2726 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002729/* --- Unicode Internal Codec ------------------------------------------- */
2730
2731PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002733 const char *errors)
2734{
2735 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002736 Py_ssize_t startinpos;
2737 Py_ssize_t endinpos;
2738 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002739 PyUnicodeObject *v;
2740 Py_UNICODE *p;
2741 const char *end;
2742 const char *reason;
2743 PyObject *errorHandler = NULL;
2744 PyObject *exc = NULL;
2745
Neal Norwitzd43069c2006-01-08 01:12:10 +00002746#ifdef Py_UNICODE_WIDE
2747 Py_UNICODE unimax = PyUnicode_GetMax();
2748#endif
2749
Thomas Wouters89f507f2006-12-13 04:49:30 +00002750 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002751 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2752 if (v == NULL)
2753 goto onError;
2754 if (PyUnicode_GetSize((PyObject *)v) == 0)
2755 return (PyObject *)v;
2756 p = PyUnicode_AS_UNICODE(v);
2757 end = s + size;
2758
2759 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002760 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002761 /* We have to sanity check the raw data, otherwise doom looms for
2762 some malformed UCS-4 data. */
2763 if (
2764 #ifdef Py_UNICODE_WIDE
2765 *p > unimax || *p < 0 ||
2766 #endif
2767 end-s < Py_UNICODE_SIZE
2768 )
2769 {
2770 startinpos = s - starts;
2771 if (end-s < Py_UNICODE_SIZE) {
2772 endinpos = end-starts;
2773 reason = "truncated input";
2774 }
2775 else {
2776 endinpos = s - starts + Py_UNICODE_SIZE;
2777 reason = "illegal code point (> 0x10FFFF)";
2778 }
2779 outpos = p - PyUnicode_AS_UNICODE(v);
2780 if (unicode_decode_call_errorhandler(
2781 errors, &errorHandler,
2782 "unicode_internal", reason,
2783 starts, size, &startinpos, &endinpos, &exc, &s,
2784 (PyObject **)&v, &outpos, &p)) {
2785 goto onError;
2786 }
2787 }
2788 else {
2789 p++;
2790 s += Py_UNICODE_SIZE;
2791 }
2792 }
2793
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002794 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002795 goto onError;
2796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
2798 return (PyObject *)v;
2799
2800 onError:
2801 Py_XDECREF(v);
2802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
2804 return NULL;
2805}
2806
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807/* --- Latin-1 Codec ------------------------------------------------------ */
2808
2809PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002810 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 const char *errors)
2812{
2813 PyUnicodeObject *v;
2814 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002815
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002817 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002818 Py_UNICODE r = *(unsigned char*)s;
2819 return PyUnicode_FromUnicode(&r, 1);
2820 }
2821
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 v = _PyUnicode_New(size);
2823 if (v == NULL)
2824 goto onError;
2825 if (size == 0)
2826 return (PyObject *)v;
2827 p = PyUnicode_AS_UNICODE(v);
2828 while (size-- > 0)
2829 *p++ = (unsigned char)*s++;
2830 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002831
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 onError:
2833 Py_XDECREF(v);
2834 return NULL;
2835}
2836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837/* create or adjust a UnicodeEncodeError */
2838static void make_encode_exception(PyObject **exceptionObject,
2839 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002840 const Py_UNICODE *unicode, Py_ssize_t size,
2841 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 if (*exceptionObject == NULL) {
2845 *exceptionObject = PyUnicodeEncodeError_Create(
2846 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 }
2848 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2850 goto onError;
2851 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2852 goto onError;
2853 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2854 goto onError;
2855 return;
2856 onError:
2857 Py_DECREF(*exceptionObject);
2858 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
2860}
2861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862/* raises a UnicodeEncodeError */
2863static void raise_encode_exception(PyObject **exceptionObject,
2864 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 const Py_UNICODE *unicode, Py_ssize_t size,
2866 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 const char *reason)
2868{
2869 make_encode_exception(exceptionObject,
2870 encoding, unicode, size, startpos, endpos, reason);
2871 if (*exceptionObject != NULL)
2872 PyCodec_StrictErrors(*exceptionObject);
2873}
2874
2875/* error handling callback helper:
2876 build arguments, call the callback and check the arguments,
2877 put the result into newpos and return the replacement string, which
2878 has to be freed by the caller */
2879static PyObject *unicode_encode_call_errorhandler(const char *errors,
2880 PyObject **errorHandler,
2881 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2883 Py_ssize_t startpos, Py_ssize_t endpos,
2884 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002886 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887
2888 PyObject *restuple;
2889 PyObject *resunicode;
2890
2891 if (*errorHandler == NULL) {
2892 *errorHandler = PyCodec_LookupError(errors);
2893 if (*errorHandler == NULL)
2894 return NULL;
2895 }
2896
2897 make_encode_exception(exceptionObject,
2898 encoding, unicode, size, startpos, endpos, reason);
2899 if (*exceptionObject == NULL)
2900 return NULL;
2901
2902 restuple = PyObject_CallFunctionObjArgs(
2903 *errorHandler, *exceptionObject, NULL);
2904 if (restuple == NULL)
2905 return NULL;
2906 if (!PyTuple_Check(restuple)) {
2907 PyErr_Format(PyExc_TypeError, &argparse[4]);
2908 Py_DECREF(restuple);
2909 return NULL;
2910 }
2911 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2912 &resunicode, newpos)) {
2913 Py_DECREF(restuple);
2914 return NULL;
2915 }
2916 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002917 *newpos = size+*newpos;
2918 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002919 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002920 Py_DECREF(restuple);
2921 return NULL;
2922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 Py_INCREF(resunicode);
2924 Py_DECREF(restuple);
2925 return resunicode;
2926}
2927
2928static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002929 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 const char *errors,
2931 int limit)
2932{
2933 /* output object */
2934 PyObject *res;
2935 /* pointers to the beginning and end+1 of input */
2936 const Py_UNICODE *startp = p;
2937 const Py_UNICODE *endp = p + size;
2938 /* pointer to the beginning of the unencodable characters */
2939 /* const Py_UNICODE *badp = NULL; */
2940 /* pointer into the output */
2941 char *str;
2942 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t respos = 0;
2944 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002945 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2946 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 PyObject *errorHandler = NULL;
2948 PyObject *exc = NULL;
2949 /* the following variable is used for caching string comparisons
2950 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2951 int known_errorHandler = -1;
2952
2953 /* allocate enough for a simple encoding without
2954 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002955 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 if (res == NULL)
2957 goto onError;
2958 if (size == 0)
2959 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002960 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 ressize = size;
2962
2963 while (p<endp) {
2964 Py_UNICODE c = *p;
2965
2966 /* can we encode this? */
2967 if (c<limit) {
2968 /* no overflow check, because we know that the space is enough */
2969 *str++ = (char)c;
2970 ++p;
2971 }
2972 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002973 Py_ssize_t unicodepos = p-startp;
2974 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002976 Py_ssize_t repsize;
2977 Py_ssize_t newpos;
2978 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 Py_UNICODE *uni2;
2980 /* startpos for collecting unencodable chars */
2981 const Py_UNICODE *collstart = p;
2982 const Py_UNICODE *collend = p;
2983 /* find all unecodable characters */
2984 while ((collend < endp) && ((*collend)>=limit))
2985 ++collend;
2986 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2987 if (known_errorHandler==-1) {
2988 if ((errors==NULL) || (!strcmp(errors, "strict")))
2989 known_errorHandler = 1;
2990 else if (!strcmp(errors, "replace"))
2991 known_errorHandler = 2;
2992 else if (!strcmp(errors, "ignore"))
2993 known_errorHandler = 3;
2994 else if (!strcmp(errors, "xmlcharrefreplace"))
2995 known_errorHandler = 4;
2996 else
2997 known_errorHandler = 0;
2998 }
2999 switch (known_errorHandler) {
3000 case 1: /* strict */
3001 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3002 goto onError;
3003 case 2: /* replace */
3004 while (collstart++<collend)
3005 *str++ = '?'; /* fall through */
3006 case 3: /* ignore */
3007 p = collend;
3008 break;
3009 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003010 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 /* determine replacement size (temporarily (mis)uses p) */
3012 for (p = collstart, repsize = 0; p < collend; ++p) {
3013 if (*p<10)
3014 repsize += 2+1+1;
3015 else if (*p<100)
3016 repsize += 2+2+1;
3017 else if (*p<1000)
3018 repsize += 2+3+1;
3019 else if (*p<10000)
3020 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003021#ifndef Py_UNICODE_WIDE
3022 else
3023 repsize += 2+5+1;
3024#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 else if (*p<100000)
3026 repsize += 2+5+1;
3027 else if (*p<1000000)
3028 repsize += 2+6+1;
3029 else
3030 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003031#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 }
3033 requiredsize = respos+repsize+(endp-collend);
3034 if (requiredsize > ressize) {
3035 if (requiredsize<2*ressize)
3036 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003037 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003039 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 ressize = requiredsize;
3041 }
3042 /* generate replacement (temporarily (mis)uses p) */
3043 for (p = collstart; p < collend; ++p) {
3044 str += sprintf(str, "&#%d;", (int)*p);
3045 }
3046 p = collend;
3047 break;
3048 default:
3049 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3050 encoding, reason, startp, size, &exc,
3051 collstart-startp, collend-startp, &newpos);
3052 if (repunicode == NULL)
3053 goto onError;
3054 /* need more space? (at least enough for what we
3055 have+the replacement+the rest of the string, so
3056 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003057 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 repsize = PyUnicode_GET_SIZE(repunicode);
3059 requiredsize = respos+repsize+(endp-collend);
3060 if (requiredsize > ressize) {
3061 if (requiredsize<2*ressize)
3062 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003063 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 Py_DECREF(repunicode);
3065 goto onError;
3066 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003067 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 ressize = requiredsize;
3069 }
3070 /* check if there is anything unencodable in the replacement
3071 and copy it to the output */
3072 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3073 c = *uni2;
3074 if (c >= limit) {
3075 raise_encode_exception(&exc, encoding, startp, size,
3076 unicodepos, unicodepos+1, reason);
3077 Py_DECREF(repunicode);
3078 goto onError;
3079 }
3080 *str = (char)c;
3081 }
3082 p = startp + newpos;
3083 Py_DECREF(repunicode);
3084 }
3085 }
3086 }
3087 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003088 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 if (respos<ressize)
3090 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003091 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 Py_XDECREF(errorHandler);
3093 Py_XDECREF(exc);
3094 return res;
3095
3096 onError:
3097 Py_XDECREF(res);
3098 Py_XDECREF(errorHandler);
3099 Py_XDECREF(exc);
3100 return NULL;
3101}
3102
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003104 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 const char *errors)
3106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108}
3109
3110PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3111{
3112 if (!PyUnicode_Check(unicode)) {
3113 PyErr_BadArgument();
3114 return NULL;
3115 }
3116 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3117 PyUnicode_GET_SIZE(unicode),
3118 NULL);
3119}
3120
3121/* --- 7-bit ASCII Codec -------------------------------------------------- */
3122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003124 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 const char *errors)
3126{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 PyUnicodeObject *v;
3129 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003130 Py_ssize_t startinpos;
3131 Py_ssize_t endinpos;
3132 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 const char *e;
3134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003136
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003138 if (size == 1 && *(unsigned char*)s < 128) {
3139 Py_UNICODE r = *(unsigned char*)s;
3140 return PyUnicode_FromUnicode(&r, 1);
3141 }
Tim Petersced69f82003-09-16 20:30:58 +00003142
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 v = _PyUnicode_New(size);
3144 if (v == NULL)
3145 goto onError;
3146 if (size == 0)
3147 return (PyObject *)v;
3148 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 e = s + size;
3150 while (s < e) {
3151 register unsigned char c = (unsigned char)*s;
3152 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 ++s;
3155 }
3156 else {
3157 startinpos = s-starts;
3158 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003159 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 if (unicode_decode_call_errorhandler(
3161 errors, &errorHandler,
3162 "ascii", "ordinal not in range(128)",
3163 starts, size, &startinpos, &endinpos, &exc, &s,
3164 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003168 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003169 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003170 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 Py_XDECREF(errorHandler);
3172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003174
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 onError:
3176 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 Py_XDECREF(errorHandler);
3178 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 return NULL;
3180}
3181
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003183 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 const char *errors)
3185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187}
3188
3189PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3190{
3191 if (!PyUnicode_Check(unicode)) {
3192 PyErr_BadArgument();
3193 return NULL;
3194 }
3195 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3196 PyUnicode_GET_SIZE(unicode),
3197 NULL);
3198}
3199
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003200#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003201
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003202/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003203
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003204#if SIZEOF_INT < SIZEOF_SSIZE_T
3205#define NEED_RETRY
3206#endif
3207
3208/* XXX This code is limited to "true" double-byte encodings, as
3209 a) it assumes an incomplete character consists of a single byte, and
3210 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3211 encodings, see IsDBCSLeadByteEx documentation. */
3212
/* Return non-zero when the byte at s[offset] should be treated as a
   DBCS lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, because the same byte value
   can also appear as the second byte of a character.  So the previous
   character position is inspected as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    /* CharPrev did not move -- presumably `here` is the start of s. */
    if (before == here)
        return 1;
    /* The preceding character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The preceding character is a complete two-byte character. */
    return here - before == 2;
}
3223
3224/*
3225 * Decode MBCS string into unicode object. If 'final' is set, converts
3226 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3227 */
3228static int decode_mbcs(PyUnicodeObject **v,
3229 const char *s, /* MBCS string */
3230 int size, /* sizeof MBCS string */
3231 int final)
3232{
3233 Py_UNICODE *p;
3234 Py_ssize_t n = 0;
3235 int usize = 0;
3236
3237 assert(size >= 0);
3238
3239 /* Skip trailing lead-byte unless 'final' is set */
3240 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3241 --size;
3242
3243 /* First get the size of the result */
3244 if (size > 0) {
3245 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3246 if (usize == 0) {
3247 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3248 return -1;
3249 }
3250 }
3251
3252 if (*v == NULL) {
3253 /* Create unicode object */
3254 *v = _PyUnicode_New(usize);
3255 if (*v == NULL)
3256 return -1;
3257 }
3258 else {
3259 /* Extend unicode object */
3260 n = PyUnicode_GET_SIZE(*v);
3261 if (_PyUnicode_Resize(v, n + usize) < 0)
3262 return -1;
3263 }
3264
3265 /* Do the conversion */
3266 if (size > 0) {
3267 p = PyUnicode_AS_UNICODE(*v) + n;
3268 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3269 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3270 return -1;
3271 }
3272 }
3273
3274 return size;
3275}
3276
3277PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3278 Py_ssize_t size,
3279 const char *errors,
3280 Py_ssize_t *consumed)
3281{
3282 PyUnicodeObject *v = NULL;
3283 int done;
3284
3285 if (consumed)
3286 *consumed = 0;
3287
3288#ifdef NEED_RETRY
3289 retry:
3290 if (size > INT_MAX)
3291 done = decode_mbcs(&v, s, INT_MAX, 0);
3292 else
3293#endif
3294 done = decode_mbcs(&v, s, (int)size, !consumed);
3295
3296 if (done < 0) {
3297 Py_XDECREF(v);
3298 return NULL;
3299 }
3300
3301 if (consumed)
3302 *consumed += done;
3303
3304#ifdef NEED_RETRY
3305 if (size > INT_MAX) {
3306 s += done;
3307 size -= done;
3308 goto retry;
3309 }
3310#endif
3311
3312 return (PyObject *)v;
3313}
3314
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003315PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003317 const char *errors)
3318{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003319 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3320}
3321
3322/*
3323 * Convert unicode into string object (MBCS).
3324 * Returns 0 if succeed, -1 otherwise.
3325 */
3326static int encode_mbcs(PyObject **repr,
3327 const Py_UNICODE *p, /* unicode */
3328 int size) /* size of unicode */
3329{
3330 int mbcssize = 0;
3331 Py_ssize_t n = 0;
3332
3333 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003334
3335 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003336 if (size > 0) {
3337 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3338 if (mbcssize == 0) {
3339 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3340 return -1;
3341 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003342 }
3343
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003344 if (*repr == NULL) {
3345 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003346 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003347 if (*repr == NULL)
3348 return -1;
3349 }
3350 else {
3351 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003352 n = PyBytes_Size(*repr);
3353 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003354 return -1;
3355 }
3356
3357 /* Do the conversion */
3358 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003359 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003360 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3361 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3362 return -1;
3363 }
3364 }
3365
3366 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003367}
3368
3369PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003371 const char *errors)
3372{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003373 PyObject *repr = NULL;
3374 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003375
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003376#ifdef NEED_RETRY
3377 retry:
3378 if (size > INT_MAX)
3379 ret = encode_mbcs(&repr, p, INT_MAX);
3380 else
3381#endif
3382 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003383
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003384 if (ret < 0) {
3385 Py_XDECREF(repr);
3386 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003387 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003388
3389#ifdef NEED_RETRY
3390 if (size > INT_MAX) {
3391 p += INT_MAX;
3392 size -= INT_MAX;
3393 goto retry;
3394 }
3395#endif
3396
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003397 return repr;
3398}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003399
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003400PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3401{
3402 if (!PyUnicode_Check(unicode)) {
3403 PyErr_BadArgument();
3404 return NULL;
3405 }
3406 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3407 PyUnicode_GET_SIZE(unicode),
3408 NULL);
3409}
3410
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003411#undef NEED_RETRY
3412
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003413#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003414
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415/* --- Character Mapping Codec -------------------------------------------- */
3416
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003418 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 PyObject *mapping,
3420 const char *errors)
3421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003423 Py_ssize_t startinpos;
3424 Py_ssize_t endinpos;
3425 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 PyUnicodeObject *v;
3428 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003429 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 PyObject *errorHandler = NULL;
3431 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003432 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003434
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 /* Default to Latin-1 */
3436 if (mapping == NULL)
3437 return PyUnicode_DecodeLatin1(s, size, errors);
3438
3439 v = _PyUnicode_New(size);
3440 if (v == NULL)
3441 goto onError;
3442 if (size == 0)
3443 return (PyObject *)v;
3444 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003446 if (PyUnicode_CheckExact(mapping)) {
3447 mapstring = PyUnicode_AS_UNICODE(mapping);
3448 maplen = PyUnicode_GET_SIZE(mapping);
3449 while (s < e) {
3450 unsigned char ch = *s;
3451 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003453 if (ch < maplen)
3454 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003456 if (x == 0xfffe) {
3457 /* undefined mapping */
3458 outpos = p-PyUnicode_AS_UNICODE(v);
3459 startinpos = s-starts;
3460 endinpos = startinpos+1;
3461 if (unicode_decode_call_errorhandler(
3462 errors, &errorHandler,
3463 "charmap", "character maps to <undefined>",
3464 starts, size, &startinpos, &endinpos, &exc, &s,
3465 (PyObject **)&v, &outpos, &p)) {
3466 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003467 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003468 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003469 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003470 *p++ = x;
3471 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003473 }
3474 else {
3475 while (s < e) {
3476 unsigned char ch = *s;
3477 PyObject *w, *x;
3478
3479 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3480 w = PyInt_FromLong((long)ch);
3481 if (w == NULL)
3482 goto onError;
3483 x = PyObject_GetItem(mapping, w);
3484 Py_DECREF(w);
3485 if (x == NULL) {
3486 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3487 /* No mapping found means: mapping is undefined. */
3488 PyErr_Clear();
3489 x = Py_None;
3490 Py_INCREF(x);
3491 } else
3492 goto onError;
3493 }
3494
3495 /* Apply mapping */
3496 if (PyInt_Check(x)) {
3497 long value = PyInt_AS_LONG(x);
3498 if (value < 0 || value > 65535) {
3499 PyErr_SetString(PyExc_TypeError,
3500 "character mapping must be in range(65536)");
3501 Py_DECREF(x);
3502 goto onError;
3503 }
3504 *p++ = (Py_UNICODE)value;
3505 }
3506 else if (x == Py_None) {
3507 /* undefined mapping */
3508 outpos = p-PyUnicode_AS_UNICODE(v);
3509 startinpos = s-starts;
3510 endinpos = startinpos+1;
3511 if (unicode_decode_call_errorhandler(
3512 errors, &errorHandler,
3513 "charmap", "character maps to <undefined>",
3514 starts, size, &startinpos, &endinpos, &exc, &s,
3515 (PyObject **)&v, &outpos, &p)) {
3516 Py_DECREF(x);
3517 goto onError;
3518 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003519 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003520 continue;
3521 }
3522 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003524
3525 if (targetsize == 1)
3526 /* 1-1 mapping */
3527 *p++ = *PyUnicode_AS_UNICODE(x);
3528
3529 else if (targetsize > 1) {
3530 /* 1-n mapping */
3531 if (targetsize > extrachars) {
3532 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003533 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3534 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003535 (targetsize << 2);
3536 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003537 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003538 if (_PyUnicode_Resize(&v,
3539 PyUnicode_GET_SIZE(v) + needed) < 0) {
3540 Py_DECREF(x);
3541 goto onError;
3542 }
3543 p = PyUnicode_AS_UNICODE(v) + oldpos;
3544 }
3545 Py_UNICODE_COPY(p,
3546 PyUnicode_AS_UNICODE(x),
3547 targetsize);
3548 p += targetsize;
3549 extrachars -= targetsize;
3550 }
3551 /* 1-0 mapping: skip the character */
3552 }
3553 else {
3554 /* wrong return value */
3555 PyErr_SetString(PyExc_TypeError,
3556 "character mapping must return integer, None or unicode");
3557 Py_DECREF(x);
3558 goto onError;
3559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003561 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 }
3564 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003565 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 Py_XDECREF(errorHandler);
3568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 Py_XDECREF(errorHandler);
3573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 Py_XDECREF(v);
3575 return NULL;
3576}
3577
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003578/* Charmap encoding: the lookup table */
3579
3580struct encoding_map{
3581 PyObject_HEAD
3582 unsigned char level1[32];
3583 int count2, count3;
3584 unsigned char level23[1];
3585};
3586
3587static PyObject*
3588encoding_map_size(PyObject *obj, PyObject* args)
3589{
3590 struct encoding_map *map = (struct encoding_map*)obj;
3591 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3592 128*map->count3);
3593}
3594
3595static PyMethodDef encoding_map_methods[] = {
3596 {"size", encoding_map_size, METH_NOARGS,
3597 PyDoc_STR("Return the size (in bytes) of this object") },
3598 { 0 }
3599};
3600
3601static void
3602encoding_map_dealloc(PyObject* o)
3603{
3604 PyObject_FREE(o);
3605}
3606
3607static PyTypeObject EncodingMapType = {
3608 PyObject_HEAD_INIT(NULL)
3609 0, /*ob_size*/
3610 "EncodingMap", /*tp_name*/
3611 sizeof(struct encoding_map), /*tp_basicsize*/
3612 0, /*tp_itemsize*/
3613 /* methods */
3614 encoding_map_dealloc, /*tp_dealloc*/
3615 0, /*tp_print*/
3616 0, /*tp_getattr*/
3617 0, /*tp_setattr*/
3618 0, /*tp_compare*/
3619 0, /*tp_repr*/
3620 0, /*tp_as_number*/
3621 0, /*tp_as_sequence*/
3622 0, /*tp_as_mapping*/
3623 0, /*tp_hash*/
3624 0, /*tp_call*/
3625 0, /*tp_str*/
3626 0, /*tp_getattro*/
3627 0, /*tp_setattro*/
3628 0, /*tp_as_buffer*/
3629 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3630 0, /*tp_doc*/
3631 0, /*tp_traverse*/
3632 0, /*tp_clear*/
3633 0, /*tp_richcompare*/
3634 0, /*tp_weaklistoffset*/
3635 0, /*tp_iter*/
3636 0, /*tp_iternext*/
3637 encoding_map_methods, /*tp_methods*/
3638 0, /*tp_members*/
3639 0, /*tp_getset*/
3640 0, /*tp_base*/
3641 0, /*tp_dict*/
3642 0, /*tp_descr_get*/
3643 0, /*tp_descr_set*/
3644 0, /*tp_dictoffset*/
3645 0, /*tp_init*/
3646 0, /*tp_alloc*/
3647 0, /*tp_new*/
3648 0, /*tp_free*/
3649 0, /*tp_is_gc*/
3650};
3651
3652PyObject*
3653PyUnicode_BuildEncodingMap(PyObject* string)
3654{
3655 Py_UNICODE *decode;
3656 PyObject *result;
3657 struct encoding_map *mresult;
3658 int i;
3659 int need_dict = 0;
3660 unsigned char level1[32];
3661 unsigned char level2[512];
3662 unsigned char *mlevel1, *mlevel2, *mlevel3;
3663 int count2 = 0, count3 = 0;
3664
3665 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3666 PyErr_BadArgument();
3667 return NULL;
3668 }
3669 decode = PyUnicode_AS_UNICODE(string);
3670 memset(level1, 0xFF, sizeof level1);
3671 memset(level2, 0xFF, sizeof level2);
3672
3673 /* If there isn't a one-to-one mapping of NULL to \0,
3674 or if there are non-BMP characters, we need to use
3675 a mapping dictionary. */
3676 if (decode[0] != 0)
3677 need_dict = 1;
3678 for (i = 1; i < 256; i++) {
3679 int l1, l2;
3680 if (decode[i] == 0
3681 #ifdef Py_UNICODE_WIDE
3682 || decode[i] > 0xFFFF
3683 #endif
3684 ) {
3685 need_dict = 1;
3686 break;
3687 }
3688 if (decode[i] == 0xFFFE)
3689 /* unmapped character */
3690 continue;
3691 l1 = decode[i] >> 11;
3692 l2 = decode[i] >> 7;
3693 if (level1[l1] == 0xFF)
3694 level1[l1] = count2++;
3695 if (level2[l2] == 0xFF)
3696 level2[l2] = count3++;
3697 }
3698
3699 if (count2 >= 0xFF || count3 >= 0xFF)
3700 need_dict = 1;
3701
3702 if (need_dict) {
3703 PyObject *result = PyDict_New();
3704 PyObject *key, *value;
3705 if (!result)
3706 return NULL;
3707 for (i = 0; i < 256; i++) {
3708 key = value = NULL;
3709 key = PyInt_FromLong(decode[i]);
3710 value = PyInt_FromLong(i);
3711 if (!key || !value)
3712 goto failed1;
3713 if (PyDict_SetItem(result, key, value) == -1)
3714 goto failed1;
3715 Py_DECREF(key);
3716 Py_DECREF(value);
3717 }
3718 return result;
3719 failed1:
3720 Py_XDECREF(key);
3721 Py_XDECREF(value);
3722 Py_DECREF(result);
3723 return NULL;
3724 }
3725
3726 /* Create a three-level trie */
3727 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3728 16*count2 + 128*count3 - 1);
3729 if (!result)
3730 return PyErr_NoMemory();
3731 PyObject_Init(result, &EncodingMapType);
3732 mresult = (struct encoding_map*)result;
3733 mresult->count2 = count2;
3734 mresult->count3 = count3;
3735 mlevel1 = mresult->level1;
3736 mlevel2 = mresult->level23;
3737 mlevel3 = mresult->level23 + 16*count2;
3738 memcpy(mlevel1, level1, 32);
3739 memset(mlevel2, 0xFF, 16*count2);
3740 memset(mlevel3, 0, 128*count3);
3741 count3 = 0;
3742 for (i = 1; i < 256; i++) {
3743 int o1, o2, o3, i2, i3;
3744 if (decode[i] == 0xFFFE)
3745 /* unmapped character */
3746 continue;
3747 o1 = decode[i]>>11;
3748 o2 = (decode[i]>>7) & 0xF;
3749 i2 = 16*mlevel1[o1] + o2;
3750 if (mlevel2[i2] == 0xFF)
3751 mlevel2[i2] = count3++;
3752 o3 = decode[i] & 0x7F;
3753 i3 = 128*mlevel2[i2] + o3;
3754 mlevel3[i3] = i;
3755 }
3756 return result;
3757}
3758
3759static int
3760encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3761{
3762 struct encoding_map *map = (struct encoding_map*)mapping;
3763 int l1 = c>>11;
3764 int l2 = (c>>7) & 0xF;
3765 int l3 = c & 0x7F;
3766 int i;
3767
3768#ifdef Py_UNICODE_WIDE
3769 if (c > 0xFFFF) {
3770 return -1;
3771 }
3772#endif
3773 if (c == 0)
3774 return 0;
3775 /* level 1*/
3776 i = map->level1[l1];
3777 if (i == 0xFF) {
3778 return -1;
3779 }
3780 /* level 2*/
3781 i = map->level23[16*i+l2];
3782 if (i == 0xFF) {
3783 return -1;
3784 }
3785 /* level 3 */
3786 i = map->level23[16*map->count2 + 128*i + l3];
3787 if (i == 0) {
3788 return -1;
3789 }
3790 return i;
3791}
3792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793/* Lookup the character ch in the mapping. If the character
3794 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003795 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 PyObject *w = PyInt_FromLong((long)c);
3799 PyObject *x;
3800
3801 if (w == NULL)
3802 return NULL;
3803 x = PyObject_GetItem(mapping, w);
3804 Py_DECREF(w);
3805 if (x == NULL) {
3806 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3807 /* No mapping found means: mapping is undefined. */
3808 PyErr_Clear();
3809 x = Py_None;
3810 Py_INCREF(x);
3811 return x;
3812 } else
3813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003815 else if (x == Py_None)
3816 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 else if (PyInt_Check(x)) {
3818 long value = PyInt_AS_LONG(x);
3819 if (value < 0 || value > 255) {
3820 PyErr_SetString(PyExc_TypeError,
3821 "character mapping must be in range(256)");
3822 Py_DECREF(x);
3823 return NULL;
3824 }
3825 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 else if (PyString_Check(x))
3828 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003831 PyErr_Format(PyExc_TypeError,
3832 "character mapping must return integer, None or str8, not %.400s",
3833 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_DECREF(x);
3835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 }
3837}
3838
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003839static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003840charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003841{
Walter Dörwald827b0552007-05-12 13:23:53 +00003842 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003843 /* exponentially overallocate to minimize reallocations */
3844 if (requiredsize < 2*outsize)
3845 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003846 if (PyBytes_Resize(outobj, requiredsize)) {
3847 Py_DECREF(outobj);
3848 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003849 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003850 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003851}
3852
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003857 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 space is available. Return a new reference to the object that
3859 was put in the output buffer, or Py_None, if the mapping was undefined
3860 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003861 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003863charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003864 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003866 PyObject *rep;
3867 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003868 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003870 if (mapping->ob_type == &EncodingMapType) {
3871 int res = encoding_map_lookup(c, mapping);
3872 Py_ssize_t requiredsize = *outpos+1;
3873 if (res == -1)
3874 return enc_FAILED;
3875 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003876 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003877 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003878 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 outstart[(*outpos)++] = (char)res;
3880 return enc_SUCCESS;
3881 }
3882
3883 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003885 return enc_EXCEPTION;
3886 else if (rep==Py_None) {
3887 Py_DECREF(rep);
3888 return enc_FAILED;
3889 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003892 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003893 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003895 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003897 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3899 }
3900 else {
3901 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003902 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3903 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003904 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003905 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003906 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003907 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003909 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 memcpy(outstart + *outpos, repchars, repsize);
3911 *outpos += repsize;
3912 }
3913 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003914 Py_DECREF(rep);
3915 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916}
3917
3918/* handle an error in PyUnicode_EncodeCharmap
3919 Return 0 on success, -1 on error */
3920static
3921int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003924 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003925 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926{
3927 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003928 Py_ssize_t repsize;
3929 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 Py_UNICODE *uni2;
3931 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t collstartpos = *inpos;
3933 Py_ssize_t collendpos = *inpos+1;
3934 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 char *encoding = "charmap";
3936 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003937 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 /* find all unencodable characters */
3940 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003941 PyObject *rep;
3942 if (mapping->ob_type == &EncodingMapType) {
3943 int res = encoding_map_lookup(p[collendpos], mapping);
3944 if (res != -1)
3945 break;
3946 ++collendpos;
3947 continue;
3948 }
3949
3950 rep = charmapencode_lookup(p[collendpos], mapping);
3951 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003953 else if (rep!=Py_None) {
3954 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 break;
3956 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003957 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 ++collendpos;
3959 }
3960 /* cache callback name lookup
3961 * (if not done yet, i.e. it's the first error) */
3962 if (*known_errorHandler==-1) {
3963 if ((errors==NULL) || (!strcmp(errors, "strict")))
3964 *known_errorHandler = 1;
3965 else if (!strcmp(errors, "replace"))
3966 *known_errorHandler = 2;
3967 else if (!strcmp(errors, "ignore"))
3968 *known_errorHandler = 3;
3969 else if (!strcmp(errors, "xmlcharrefreplace"))
3970 *known_errorHandler = 4;
3971 else
3972 *known_errorHandler = 0;
3973 }
3974 switch (*known_errorHandler) {
3975 case 1: /* strict */
3976 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3977 return -1;
3978 case 2: /* replace */
3979 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3980 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003981 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 return -1;
3983 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003984 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3986 return -1;
3987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 }
3989 /* fall through */
3990 case 3: /* ignore */
3991 *inpos = collendpos;
3992 break;
3993 case 4: /* xmlcharrefreplace */
3994 /* generate replacement (temporarily (mis)uses p) */
3995 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3996 char buffer[2+29+1+1];
3997 char *cp;
3998 sprintf(buffer, "&#%d;", (int)p[collpos]);
3999 for (cp = buffer; *cp; ++cp) {
4000 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004001 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004003 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4005 return -1;
4006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 }
4008 }
4009 *inpos = collendpos;
4010 break;
4011 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004012 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 encoding, reason, p, size, exceptionObject,
4014 collstartpos, collendpos, &newpos);
4015 if (repunicode == NULL)
4016 return -1;
4017 /* generate replacement */
4018 repsize = PyUnicode_GET_SIZE(repunicode);
4019 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4020 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004021 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 return -1;
4023 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004024 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4027 return -1;
4028 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 }
4030 *inpos = newpos;
4031 Py_DECREF(repunicode);
4032 }
4033 return 0;
4034}
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004037 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 PyObject *mapping,
4039 const char *errors)
4040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 /* output object */
4042 PyObject *res = NULL;
4043 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 PyObject *errorHandler = NULL;
4048 PyObject *exc = NULL;
4049 /* the following variable is used for caching string comparisons
4050 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4051 * 3=ignore, 4=xmlcharrefreplace */
4052 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053
4054 /* Default to Latin-1 */
4055 if (mapping == NULL)
4056 return PyUnicode_EncodeLatin1(p, size, errors);
4057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 /* allocate enough for a simple encoding without
4059 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004060 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 if (res == NULL)
4062 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004063 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 while (inpos<size) {
4067 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004068 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004069 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004071 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 if (charmap_encoding_error(p, size, &inpos, mapping,
4073 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004074 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004075 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004076 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 else
4080 /* done with this character => adjust input position */
4081 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004085 if (respos<PyBytes_GET_SIZE(res)) {
4086 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 goto onError;
4088 }
4089 Py_XDECREF(exc);
4090 Py_XDECREF(errorHandler);
4091 return res;
4092
4093 onError:
4094 Py_XDECREF(res);
4095 Py_XDECREF(exc);
4096 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 return NULL;
4098}
4099
4100PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4101 PyObject *mapping)
4102{
4103 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4104 PyErr_BadArgument();
4105 return NULL;
4106 }
4107 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4108 PyUnicode_GET_SIZE(unicode),
4109 mapping,
4110 NULL);
4111}
4112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113/* create or adjust a UnicodeTranslateError */
4114static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004115 const Py_UNICODE *unicode, Py_ssize_t size,
4116 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 if (*exceptionObject == NULL) {
4120 *exceptionObject = PyUnicodeTranslateError_Create(
4121 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 }
4123 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4125 goto onError;
4126 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4127 goto onError;
4128 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4129 goto onError;
4130 return;
4131 onError:
4132 Py_DECREF(*exceptionObject);
4133 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 }
4135}
4136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137/* raises a UnicodeTranslateError */
4138static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004139 const Py_UNICODE *unicode, Py_ssize_t size,
4140 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 const char *reason)
4142{
4143 make_translate_exception(exceptionObject,
4144 unicode, size, startpos, endpos, reason);
4145 if (*exceptionObject != NULL)
4146 PyCodec_StrictErrors(*exceptionObject);
4147}
4148
4149/* error handling callback helper:
4150 build arguments, call the callback and check the arguments,
4151 put the result into newpos and return the replacement string, which
4152 has to be freed by the caller */
4153static PyObject *unicode_translate_call_errorhandler(const char *errors,
4154 PyObject **errorHandler,
4155 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004156 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4157 Py_ssize_t startpos, Py_ssize_t endpos,
4158 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004160 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004162 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 PyObject *restuple;
4164 PyObject *resunicode;
4165
4166 if (*errorHandler == NULL) {
4167 *errorHandler = PyCodec_LookupError(errors);
4168 if (*errorHandler == NULL)
4169 return NULL;
4170 }
4171
4172 make_translate_exception(exceptionObject,
4173 unicode, size, startpos, endpos, reason);
4174 if (*exceptionObject == NULL)
4175 return NULL;
4176
4177 restuple = PyObject_CallFunctionObjArgs(
4178 *errorHandler, *exceptionObject, NULL);
4179 if (restuple == NULL)
4180 return NULL;
4181 if (!PyTuple_Check(restuple)) {
4182 PyErr_Format(PyExc_TypeError, &argparse[4]);
4183 Py_DECREF(restuple);
4184 return NULL;
4185 }
4186 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004187 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 Py_DECREF(restuple);
4189 return NULL;
4190 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004191 if (i_newpos<0)
4192 *newpos = size+i_newpos;
4193 else
4194 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004195 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004196 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004197 Py_DECREF(restuple);
4198 return NULL;
4199 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 Py_INCREF(resunicode);
4201 Py_DECREF(restuple);
4202 return resunicode;
4203}
4204
4205/* Lookup the character ch in the mapping and put the result in result,
4206 which must be decrefed by the caller.
4207 Return 0 on success, -1 on error */
4208static
4209int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4210{
4211 PyObject *w = PyInt_FromLong((long)c);
4212 PyObject *x;
4213
4214 if (w == NULL)
4215 return -1;
4216 x = PyObject_GetItem(mapping, w);
4217 Py_DECREF(w);
4218 if (x == NULL) {
4219 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4220 /* No mapping found means: use 1:1 mapping. */
4221 PyErr_Clear();
4222 *result = NULL;
4223 return 0;
4224 } else
4225 return -1;
4226 }
4227 else if (x == Py_None) {
4228 *result = x;
4229 return 0;
4230 }
4231 else if (PyInt_Check(x)) {
4232 long value = PyInt_AS_LONG(x);
4233 long max = PyUnicode_GetMax();
4234 if (value < 0 || value > max) {
4235 PyErr_Format(PyExc_TypeError,
4236 "character mapping must be in range(0x%lx)", max+1);
4237 Py_DECREF(x);
4238 return -1;
4239 }
4240 *result = x;
4241 return 0;
4242 }
4243 else if (PyUnicode_Check(x)) {
4244 *result = x;
4245 return 0;
4246 }
4247 else {
4248 /* wrong return value */
4249 PyErr_SetString(PyExc_TypeError,
4250 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004251 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 return -1;
4253 }
4254}
4255/* ensure that *outobj is at least requiredsize characters long,
4256if not reallocate and adjust various state variables.
4257Return 0 on success, -1 on error */
4258static
Walter Dörwald4894c302003-10-24 14:25:28 +00004259int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004260 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004262 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004263 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004265 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004267 if (requiredsize < 2 * oldsize)
4268 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004269 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 return -1;
4271 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 }
4273 return 0;
4274}
4275/* lookup the character, put the result in the output string and adjust
4276 various state variables. Return a new reference to the object that
4277 was put in the output buffer in *result, or Py_None, if the mapping was
4278 undefined (in which case no character was written).
4279 The called must decref result.
4280 Return 0 on success, -1 on error. */
4281static
Walter Dörwald4894c302003-10-24 14:25:28 +00004282int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004284 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285{
Walter Dörwald4894c302003-10-24 14:25:28 +00004286 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 return -1;
4288 if (*res==NULL) {
4289 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004290 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 }
4292 else if (*res==Py_None)
4293 ;
4294 else if (PyInt_Check(*res)) {
4295 /* no overflow check, because we know that the space is enough */
4296 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4297 }
4298 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004299 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 if (repsize==1) {
4301 /* no overflow check, because we know that the space is enough */
4302 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4303 }
4304 else if (repsize!=0) {
4305 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004307 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004308 repsize - 1;
4309 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 return -1;
4311 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4312 *outp += repsize;
4313 }
4314 }
4315 else
4316 return -1;
4317 return 0;
4318}
4319
4320PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 PyObject *mapping,
4323 const char *errors)
4324{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325 /* output object */
4326 PyObject *res = NULL;
4327 /* pointers to the beginning and end+1 of input */
4328 const Py_UNICODE *startp = p;
4329 const Py_UNICODE *endp = p + size;
4330 /* pointer into the output */
4331 Py_UNICODE *str;
4332 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004333 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 char *reason = "character maps to <undefined>";
4335 PyObject *errorHandler = NULL;
4336 PyObject *exc = NULL;
4337 /* the following variable is used for caching string comparisons
4338 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4339 * 3=ignore, 4=xmlcharrefreplace */
4340 int known_errorHandler = -1;
4341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 if (mapping == NULL) {
4343 PyErr_BadArgument();
4344 return NULL;
4345 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346
4347 /* allocate enough for a simple 1:1 translation without
4348 replacements, if we need more, we'll resize */
4349 res = PyUnicode_FromUnicode(NULL, size);
4350 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 return res;
4354 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 while (p<endp) {
4357 /* try to encode it */
4358 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004359 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 goto onError;
4362 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004363 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 if (x!=Py_None) /* it worked => adjust input pointer */
4365 ++p;
4366 else { /* untranslatable character */
4367 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004368 Py_ssize_t repsize;
4369 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 Py_UNICODE *uni2;
4371 /* startpos for collecting untranslatable chars */
4372 const Py_UNICODE *collstart = p;
4373 const Py_UNICODE *collend = p+1;
4374 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 /* find all untranslatable characters */
4377 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004378 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 goto onError;
4380 Py_XDECREF(x);
4381 if (x!=Py_None)
4382 break;
4383 ++collend;
4384 }
4385 /* cache callback name lookup
4386 * (if not done yet, i.e. it's the first error) */
4387 if (known_errorHandler==-1) {
4388 if ((errors==NULL) || (!strcmp(errors, "strict")))
4389 known_errorHandler = 1;
4390 else if (!strcmp(errors, "replace"))
4391 known_errorHandler = 2;
4392 else if (!strcmp(errors, "ignore"))
4393 known_errorHandler = 3;
4394 else if (!strcmp(errors, "xmlcharrefreplace"))
4395 known_errorHandler = 4;
4396 else
4397 known_errorHandler = 0;
4398 }
4399 switch (known_errorHandler) {
4400 case 1: /* strict */
4401 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4402 goto onError;
4403 case 2: /* replace */
4404 /* No need to check for space, this is a 1:1 replacement */
4405 for (coll = collstart; coll<collend; ++coll)
4406 *str++ = '?';
4407 /* fall through */
4408 case 3: /* ignore */
4409 p = collend;
4410 break;
4411 case 4: /* xmlcharrefreplace */
4412 /* generate replacement (temporarily (mis)uses p) */
4413 for (p = collstart; p < collend; ++p) {
4414 char buffer[2+29+1+1];
4415 char *cp;
4416 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004417 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4419 goto onError;
4420 for (cp = buffer; *cp; ++cp)
4421 *str++ = *cp;
4422 }
4423 p = collend;
4424 break;
4425 default:
4426 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4427 reason, startp, size, &exc,
4428 collstart-startp, collend-startp, &newpos);
4429 if (repunicode == NULL)
4430 goto onError;
4431 /* generate replacement */
4432 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004433 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4435 Py_DECREF(repunicode);
4436 goto onError;
4437 }
4438 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4439 *str++ = *uni2;
4440 p = startp + newpos;
4441 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 }
4443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 /* Resize if we allocated to much */
4446 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004447 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004448 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004449 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 }
4451 Py_XDECREF(exc);
4452 Py_XDECREF(errorHandler);
4453 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 onError:
4456 Py_XDECREF(res);
4457 Py_XDECREF(exc);
4458 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 return NULL;
4460}
4461
4462PyObject *PyUnicode_Translate(PyObject *str,
4463 PyObject *mapping,
4464 const char *errors)
4465{
4466 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004467
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 str = PyUnicode_FromObject(str);
4469 if (str == NULL)
4470 goto onError;
4471 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4472 PyUnicode_GET_SIZE(str),
4473 mapping,
4474 errors);
4475 Py_DECREF(str);
4476 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004477
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 onError:
4479 Py_XDECREF(str);
4480 return NULL;
4481}
Tim Petersced69f82003-09-16 20:30:58 +00004482
Guido van Rossum9e896b32000-04-05 20:11:21 +00004483/* --- Decimal Encoder ---------------------------------------------------- */
4484
4485int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004486 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004487 char *output,
4488 const char *errors)
4489{
4490 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 PyObject *errorHandler = NULL;
4492 PyObject *exc = NULL;
4493 const char *encoding = "decimal";
4494 const char *reason = "invalid decimal Unicode string";
4495 /* the following variable is used for caching string comparisons
4496 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4497 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004498
4499 if (output == NULL) {
4500 PyErr_BadArgument();
4501 return -1;
4502 }
4503
4504 p = s;
4505 end = s + length;
4506 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004508 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004510 Py_ssize_t repsize;
4511 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 Py_UNICODE *uni2;
4513 Py_UNICODE *collstart;
4514 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004515
Guido van Rossum9e896b32000-04-05 20:11:21 +00004516 if (Py_UNICODE_ISSPACE(ch)) {
4517 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004519 continue;
4520 }
4521 decimal = Py_UNICODE_TODECIMAL(ch);
4522 if (decimal >= 0) {
4523 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004525 continue;
4526 }
Guido van Rossumba477042000-04-06 18:18:10 +00004527 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004528 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004530 continue;
4531 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 /* All other characters are considered unencodable */
4533 collstart = p;
4534 collend = p+1;
4535 while (collend < end) {
4536 if ((0 < *collend && *collend < 256) ||
4537 !Py_UNICODE_ISSPACE(*collend) ||
4538 Py_UNICODE_TODECIMAL(*collend))
4539 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* cache callback name lookup
4542 * (if not done yet, i.e. it's the first error) */
4543 if (known_errorHandler==-1) {
4544 if ((errors==NULL) || (!strcmp(errors, "strict")))
4545 known_errorHandler = 1;
4546 else if (!strcmp(errors, "replace"))
4547 known_errorHandler = 2;
4548 else if (!strcmp(errors, "ignore"))
4549 known_errorHandler = 3;
4550 else if (!strcmp(errors, "xmlcharrefreplace"))
4551 known_errorHandler = 4;
4552 else
4553 known_errorHandler = 0;
4554 }
4555 switch (known_errorHandler) {
4556 case 1: /* strict */
4557 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4558 goto onError;
4559 case 2: /* replace */
4560 for (p = collstart; p < collend; ++p)
4561 *output++ = '?';
4562 /* fall through */
4563 case 3: /* ignore */
4564 p = collend;
4565 break;
4566 case 4: /* xmlcharrefreplace */
4567 /* generate replacement (temporarily (mis)uses p) */
4568 for (p = collstart; p < collend; ++p)
4569 output += sprintf(output, "&#%d;", (int)*p);
4570 p = collend;
4571 break;
4572 default:
4573 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4574 encoding, reason, s, length, &exc,
4575 collstart-s, collend-s, &newpos);
4576 if (repunicode == NULL)
4577 goto onError;
4578 /* generate replacement */
4579 repsize = PyUnicode_GET_SIZE(repunicode);
4580 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4581 Py_UNICODE ch = *uni2;
4582 if (Py_UNICODE_ISSPACE(ch))
4583 *output++ = ' ';
4584 else {
4585 decimal = Py_UNICODE_TODECIMAL(ch);
4586 if (decimal >= 0)
4587 *output++ = '0' + decimal;
4588 else if (0 < ch && ch < 256)
4589 *output++ = (char)ch;
4590 else {
4591 Py_DECREF(repunicode);
4592 raise_encode_exception(&exc, encoding,
4593 s, length, collstart-s, collend-s, reason);
4594 goto onError;
4595 }
4596 }
4597 }
4598 p = s + newpos;
4599 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004600 }
4601 }
4602 /* 0-terminate the output string */
4603 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 Py_XDECREF(exc);
4605 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004606 return 0;
4607
4608 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 Py_XDECREF(exc);
4610 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004611 return -1;
4612}
4613
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614/* --- Helpers ------------------------------------------------------------ */
4615
Thomas Wouters477c8d52006-05-27 19:21:47 +00004616#define STRINGLIB_CHAR Py_UNICODE
4617
4618#define STRINGLIB_LEN PyUnicode_GET_SIZE
4619#define STRINGLIB_NEW PyUnicode_FromUnicode
4620#define STRINGLIB_STR PyUnicode_AS_UNICODE
4621
4622Py_LOCAL_INLINE(int)
4623STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004625 if (str[0] != other[0])
4626 return 1;
4627 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628}
4629
Thomas Wouters477c8d52006-05-27 19:21:47 +00004630#define STRINGLIB_EMPTY unicode_empty
4631
4632#include "stringlib/fastsearch.h"
4633
4634#include "stringlib/count.h"
4635#include "stringlib/find.h"
4636#include "stringlib/partition.h"
4637
/* Helper macro that normalizes slice bounds in place.  It expects two
   variables named `start` and `end` in the enclosing scope and reads
   (obj)->length: negative values are taken relative to the length and
   then clamped to 0, and `end` is additionally capped at the length.
   Expands to several statements and evaluates `obj` more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length) {                  \
        end = (obj)->length;                    \
    }                                           \
    if (end < 0) {                              \
        end += (obj)->length;                   \
    }                                           \
    if (end < 0)                                \
        end = 0;
4650
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004652 PyObject *substr,
4653 Py_ssize_t start,
4654 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004656 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004657 PyUnicodeObject* str_obj;
4658 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004659
Thomas Wouters477c8d52006-05-27 19:21:47 +00004660 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4661 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004663 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4664 if (!sub_obj) {
4665 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 return -1;
4667 }
Tim Petersced69f82003-09-16 20:30:58 +00004668
Thomas Wouters477c8d52006-05-27 19:21:47 +00004669 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004670
Thomas Wouters477c8d52006-05-27 19:21:47 +00004671 result = stringlib_count(
4672 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4673 );
4674
4675 Py_DECREF(sub_obj);
4676 Py_DECREF(str_obj);
4677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 return result;
4679}
4680
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004682 PyObject *sub,
4683 Py_ssize_t start,
4684 Py_ssize_t end,
4685 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004688
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004690 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004691 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004692 sub = PyUnicode_FromObject(sub);
4693 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004694 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004695 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
Tim Petersced69f82003-09-16 20:30:58 +00004697
Thomas Wouters477c8d52006-05-27 19:21:47 +00004698 if (direction > 0)
4699 result = stringlib_find_slice(
4700 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4701 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4702 start, end
4703 );
4704 else
4705 result = stringlib_rfind_slice(
4706 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4707 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4708 start, end
4709 );
4710
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004712 Py_DECREF(sub);
4713
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 return result;
4715}
4716
Tim Petersced69f82003-09-16 20:30:58 +00004717static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718int tailmatch(PyUnicodeObject *self,
4719 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 Py_ssize_t start,
4721 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 int direction)
4723{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 if (substring->length == 0)
4725 return 1;
4726
Thomas Wouters477c8d52006-05-27 19:21:47 +00004727 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728
4729 end -= substring->length;
4730 if (end < start)
4731 return 0;
4732
4733 if (direction > 0) {
4734 if (Py_UNICODE_MATCH(self, end, substring))
4735 return 1;
4736 } else {
4737 if (Py_UNICODE_MATCH(self, start, substring))
4738 return 1;
4739 }
4740
4741 return 0;
4742}
4743
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t start,
4747 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 int direction)
4749{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004751
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 str = PyUnicode_FromObject(str);
4753 if (str == NULL)
4754 return -1;
4755 substr = PyUnicode_FromObject(substr);
4756 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004757 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return -1;
4759 }
Tim Petersced69f82003-09-16 20:30:58 +00004760
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 result = tailmatch((PyUnicodeObject *)str,
4762 (PyUnicodeObject *)substr,
4763 start, end, direction);
4764 Py_DECREF(str);
4765 Py_DECREF(substr);
4766 return result;
4767}
4768
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769/* Apply fixfct filter to the Unicode object self and return a
4770 reference to the modified object */
4771
Tim Petersced69f82003-09-16 20:30:58 +00004772static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773PyObject *fixup(PyUnicodeObject *self,
4774 int (*fixfct)(PyUnicodeObject *s))
4775{
4776
4777 PyUnicodeObject *u;
4778
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004779 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 if (u == NULL)
4781 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004782
4783 Py_UNICODE_COPY(u->str, self->str, self->length);
4784
Tim Peters7a29bd52001-09-12 03:03:31 +00004785 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 /* fixfct should return TRUE if it modified the buffer. If
4787 FALSE, return a reference to the original buffer instead
4788 (to save space, not time) */
4789 Py_INCREF(self);
4790 Py_DECREF(u);
4791 return (PyObject*) self;
4792 }
4793 return (PyObject*) u;
4794}
4795
Tim Petersced69f82003-09-16 20:30:58 +00004796static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797int fixupper(PyUnicodeObject *self)
4798{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 Py_UNICODE *s = self->str;
4801 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004802
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 while (len-- > 0) {
4804 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 ch = Py_UNICODE_TOUPPER(*s);
4807 if (ch != *s) {
4808 status = 1;
4809 *s = ch;
4810 }
4811 s++;
4812 }
4813
4814 return status;
4815}
4816
Tim Petersced69f82003-09-16 20:30:58 +00004817static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818int fixlower(PyUnicodeObject *self)
4819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 Py_UNICODE *s = self->str;
4822 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 while (len-- > 0) {
4825 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 ch = Py_UNICODE_TOLOWER(*s);
4828 if (ch != *s) {
4829 status = 1;
4830 *s = ch;
4831 }
4832 s++;
4833 }
4834
4835 return status;
4836}
4837
Tim Petersced69f82003-09-16 20:30:58 +00004838static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839int fixswapcase(PyUnicodeObject *self)
4840{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 Py_UNICODE *s = self->str;
4843 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 while (len-- > 0) {
4846 if (Py_UNICODE_ISUPPER(*s)) {
4847 *s = Py_UNICODE_TOLOWER(*s);
4848 status = 1;
4849 } else if (Py_UNICODE_ISLOWER(*s)) {
4850 *s = Py_UNICODE_TOUPPER(*s);
4851 status = 1;
4852 }
4853 s++;
4854 }
4855
4856 return status;
4857}
4858
Tim Petersced69f82003-09-16 20:30:58 +00004859static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860int fixcapitalize(PyUnicodeObject *self)
4861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004863 Py_UNICODE *s = self->str;
4864 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004865
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004866 if (len == 0)
4867 return 0;
4868 if (Py_UNICODE_ISLOWER(*s)) {
4869 *s = Py_UNICODE_TOUPPER(*s);
4870 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004872 s++;
4873 while (--len > 0) {
4874 if (Py_UNICODE_ISUPPER(*s)) {
4875 *s = Py_UNICODE_TOLOWER(*s);
4876 status = 1;
4877 }
4878 s++;
4879 }
4880 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881}
4882
4883static
4884int fixtitle(PyUnicodeObject *self)
4885{
4886 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4887 register Py_UNICODE *e;
4888 int previous_is_cased;
4889
4890 /* Shortcut for single character strings */
4891 if (PyUnicode_GET_SIZE(self) == 1) {
4892 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4893 if (*p != ch) {
4894 *p = ch;
4895 return 1;
4896 }
4897 else
4898 return 0;
4899 }
Tim Petersced69f82003-09-16 20:30:58 +00004900
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 e = p + PyUnicode_GET_SIZE(self);
4902 previous_is_cased = 0;
4903 for (; p < e; p++) {
4904 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004905
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 if (previous_is_cased)
4907 *p = Py_UNICODE_TOLOWER(ch);
4908 else
4909 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004910
4911 if (Py_UNICODE_ISLOWER(ch) ||
4912 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 Py_UNICODE_ISTITLE(ch))
4914 previous_is_cased = 1;
4915 else
4916 previous_is_cased = 0;
4917 }
4918 return 1;
4919}
4920
Tim Peters8ce9f162004-08-27 01:49:32 +00004921PyObject *
4922PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923{
Tim Peters8ce9f162004-08-27 01:49:32 +00004924 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004925 const Py_UNICODE blank = ' ';
4926 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004927 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004928 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004929 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4930 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004931 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4932 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004934 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004935 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
Tim Peters05eba1f2004-08-27 21:32:02 +00004937 fseq = PySequence_Fast(seq, "");
4938 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004940 }
4941
Tim Peters91879ab2004-08-27 22:35:44 +00004942 /* Grrrr. A codec may be invoked to convert str objects to
4943 * Unicode, and so it's possible to call back into Python code
4944 * during PyUnicode_FromObject(), and so it's possible for a sick
4945 * codec to change the size of fseq (if seq is a list). Therefore
4946 * we have to keep refetching the size -- can't assume seqlen
4947 * is invariant.
4948 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004949 seqlen = PySequence_Fast_GET_SIZE(fseq);
4950 /* If empty sequence, return u"". */
4951 if (seqlen == 0) {
4952 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4953 goto Done;
4954 }
4955 /* If singleton sequence with an exact Unicode, return that. */
4956 if (seqlen == 1) {
4957 item = PySequence_Fast_GET_ITEM(fseq, 0);
4958 if (PyUnicode_CheckExact(item)) {
4959 Py_INCREF(item);
4960 res = (PyUnicodeObject *)item;
4961 goto Done;
4962 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004963 }
4964
Tim Peters05eba1f2004-08-27 21:32:02 +00004965 /* At least two items to join, or one that isn't exact Unicode. */
4966 if (seqlen > 1) {
4967 /* Set up sep and seplen -- they're needed. */
4968 if (separator == NULL) {
4969 sep = &blank;
4970 seplen = 1;
4971 }
4972 else {
4973 internal_separator = PyUnicode_FromObject(separator);
4974 if (internal_separator == NULL)
4975 goto onError;
4976 sep = PyUnicode_AS_UNICODE(internal_separator);
4977 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004978 /* In case PyUnicode_FromObject() mutated seq. */
4979 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004980 }
4981 }
4982
4983 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004984 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004985 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004986 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004987 res_p = PyUnicode_AS_UNICODE(res);
4988 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004989
Tim Peters05eba1f2004-08-27 21:32:02 +00004990 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004991 Py_ssize_t itemlen;
4992 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004993
4994 item = PySequence_Fast_GET_ITEM(fseq, i);
4995 /* Convert item to Unicode. */
4996 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4997 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004998 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004999 " %.80s found",
5000 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005001 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005002 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005003 item = PyUnicode_FromObject(item);
5004 if (item == NULL)
5005 goto onError;
5006 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005007
Tim Peters91879ab2004-08-27 22:35:44 +00005008 /* In case PyUnicode_FromObject() mutated seq. */
5009 seqlen = PySequence_Fast_GET_SIZE(fseq);
5010
Tim Peters8ce9f162004-08-27 01:49:32 +00005011 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005013 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005015 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005016 if (i < seqlen - 1) {
5017 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005019 goto Overflow;
5020 }
5021 if (new_res_used > res_alloc) {
5022 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005023 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005024 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005025 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005026 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005027 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005028 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005029 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005031 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005032 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005034
5035 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005036 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005037 res_p += itemlen;
5038 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005039 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005040 res_p += seplen;
5041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005043 res_used = new_res_used;
5044 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005045
Tim Peters05eba1f2004-08-27 21:32:02 +00005046 /* Shrink res to match the used area; this probably can't fail,
5047 * but it's cheap to check.
5048 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005049 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005050 goto onError;
5051
5052 Done:
5053 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005054 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 return (PyObject *)res;
5056
Tim Peters8ce9f162004-08-27 01:49:32 +00005057 Overflow:
5058 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005059 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005060 Py_DECREF(item);
5061 /* fall through */
5062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005064 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005065 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005066 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 return NULL;
5068}
5069
Tim Petersced69f82003-09-16 20:30:58 +00005070static
5071PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005072 Py_ssize_t left,
5073 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 Py_UNICODE fill)
5075{
5076 PyUnicodeObject *u;
5077
5078 if (left < 0)
5079 left = 0;
5080 if (right < 0)
5081 right = 0;
5082
Tim Peters7a29bd52001-09-12 03:03:31 +00005083 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 Py_INCREF(self);
5085 return self;
5086 }
5087
5088 u = _PyUnicode_New(left + self->length + right);
5089 if (u) {
5090 if (left)
5091 Py_UNICODE_FILL(u->str, fill, left);
5092 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5093 if (right)
5094 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5095 }
5096
5097 return u;
5098}
5099
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   Relies on names in the expanding function: a `PyObject *str` scratch
   variable, the `list` being built, and an `onError` label that is jumped
   to if creating the slice or appending it fails.  The temporary
   reference in `str` is always released.

   Wrapped in do { ... } while (0) so the expansion is a single statement;
   use it as `SPLIT_APPEND(data, left, right);`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5110
5111static
5112PyObject *split_whitespace(PyUnicodeObject *self,
5113 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005114 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005116 register Py_ssize_t i;
5117 register Py_ssize_t j;
5118 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 PyObject *str;
5120
5121 for (i = j = 0; i < len; ) {
5122 /* find a token */
5123 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5124 i++;
5125 j = i;
5126 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5127 i++;
5128 if (j < i) {
5129 if (maxcount-- <= 0)
5130 break;
5131 SPLIT_APPEND(self->str, j, i);
5132 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5133 i++;
5134 j = i;
5135 }
5136 }
5137 if (j < len) {
5138 SPLIT_APPEND(self->str, j, len);
5139 }
5140 return list;
5141
5142 onError:
5143 Py_DECREF(list);
5144 return NULL;
5145}
5146
5147PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005148 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005150 register Py_ssize_t i;
5151 register Py_ssize_t j;
5152 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 PyObject *list;
5154 PyObject *str;
5155 Py_UNICODE *data;
5156
5157 string = PyUnicode_FromObject(string);
5158 if (string == NULL)
5159 return NULL;
5160 data = PyUnicode_AS_UNICODE(string);
5161 len = PyUnicode_GET_SIZE(string);
5162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 list = PyList_New(0);
5164 if (!list)
5165 goto onError;
5166
5167 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005168 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005171 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173
5174 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005175 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 if (i < len) {
5177 if (data[i] == '\r' && i + 1 < len &&
5178 data[i+1] == '\n')
5179 i += 2;
5180 else
5181 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005182 if (keepends)
5183 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 }
Guido van Rossum86662912000-04-11 15:38:46 +00005185 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 j = i;
5187 }
5188 if (j < len) {
5189 SPLIT_APPEND(data, j, len);
5190 }
5191
5192 Py_DECREF(string);
5193 return list;
5194
5195 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005196 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 Py_DECREF(string);
5198 return NULL;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202PyObject *split_char(PyUnicodeObject *self,
5203 PyObject *list,
5204 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005205 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 register Py_ssize_t i;
5208 register Py_ssize_t j;
5209 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 PyObject *str;
5211
5212 for (i = j = 0; i < len; ) {
5213 if (self->str[i] == ch) {
5214 if (maxcount-- <= 0)
5215 break;
5216 SPLIT_APPEND(self->str, j, i);
5217 i = j = i + 1;
5218 } else
5219 i++;
5220 }
5221 if (j <= len) {
5222 SPLIT_APPEND(self->str, j, len);
5223 }
5224 return list;
5225
5226 onError:
5227 Py_DECREF(list);
5228 return NULL;
5229}
5230
Tim Petersced69f82003-09-16 20:30:58 +00005231static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232PyObject *split_substring(PyUnicodeObject *self,
5233 PyObject *list,
5234 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 register Py_ssize_t i;
5238 register Py_ssize_t j;
5239 Py_ssize_t len = self->length;
5240 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 PyObject *str;
5242
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005243 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 if (Py_UNICODE_MATCH(self, i, substring)) {
5245 if (maxcount-- <= 0)
5246 break;
5247 SPLIT_APPEND(self->str, j, i);
5248 i = j = i + sublen;
5249 } else
5250 i++;
5251 }
5252 if (j <= len) {
5253 SPLIT_APPEND(self->str, j, len);
5254 }
5255 return list;
5256
5257 onError:
5258 Py_DECREF(list);
5259 return NULL;
5260}
5261
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005262static
5263PyObject *rsplit_whitespace(PyUnicodeObject *self,
5264 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 register Py_ssize_t i;
5268 register Py_ssize_t j;
5269 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005270 PyObject *str;
5271
5272 for (i = j = len - 1; i >= 0; ) {
5273 /* find a token */
5274 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5275 i--;
5276 j = i;
5277 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5278 i--;
5279 if (j > i) {
5280 if (maxcount-- <= 0)
5281 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005282 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005283 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5284 i--;
5285 j = i;
5286 }
5287 }
5288 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005289 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005290 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005291 if (PyList_Reverse(list) < 0)
5292 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005293 return list;
5294
5295 onError:
5296 Py_DECREF(list);
5297 return NULL;
5298}
5299
5300static
5301PyObject *rsplit_char(PyUnicodeObject *self,
5302 PyObject *list,
5303 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 register Py_ssize_t i;
5307 register Py_ssize_t j;
5308 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005309 PyObject *str;
5310
5311 for (i = j = len - 1; i >= 0; ) {
5312 if (self->str[i] == ch) {
5313 if (maxcount-- <= 0)
5314 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005315 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005316 j = i = i - 1;
5317 } else
5318 i--;
5319 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005320 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005321 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005322 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005323 if (PyList_Reverse(list) < 0)
5324 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005325 return list;
5326
5327 onError:
5328 Py_DECREF(list);
5329 return NULL;
5330}
5331
5332static
5333PyObject *rsplit_substring(PyUnicodeObject *self,
5334 PyObject *list,
5335 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 register Py_ssize_t i;
5339 register Py_ssize_t j;
5340 Py_ssize_t len = self->length;
5341 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005342 PyObject *str;
5343
5344 for (i = len - sublen, j = len; i >= 0; ) {
5345 if (Py_UNICODE_MATCH(self, i, substring)) {
5346 if (maxcount-- <= 0)
5347 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005348 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005349 j = i;
5350 i -= sublen;
5351 } else
5352 i--;
5353 }
5354 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005355 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005356 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005357 if (PyList_Reverse(list) < 0)
5358 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005359 return list;
5360
5361 onError:
5362 Py_DECREF(list);
5363 return NULL;
5364}
5365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366#undef SPLIT_APPEND
5367
5368static
5369PyObject *split(PyUnicodeObject *self,
5370 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372{
5373 PyObject *list;
5374
5375 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005376 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
5378 list = PyList_New(0);
5379 if (!list)
5380 return NULL;
5381
5382 if (substring == NULL)
5383 return split_whitespace(self,list,maxcount);
5384
5385 else if (substring->length == 1)
5386 return split_char(self,list,substring->str[0],maxcount);
5387
5388 else if (substring->length == 0) {
5389 Py_DECREF(list);
5390 PyErr_SetString(PyExc_ValueError, "empty separator");
5391 return NULL;
5392 }
5393 else
5394 return split_substring(self,list,substring,maxcount);
5395}
5396
Tim Petersced69f82003-09-16 20:30:58 +00005397static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005398PyObject *rsplit(PyUnicodeObject *self,
5399 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005400 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005401{
5402 PyObject *list;
5403
5404 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005405 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005406
5407 list = PyList_New(0);
5408 if (!list)
5409 return NULL;
5410
5411 if (substring == NULL)
5412 return rsplit_whitespace(self,list,maxcount);
5413
5414 else if (substring->length == 1)
5415 return rsplit_char(self,list,substring->str[0],maxcount);
5416
5417 else if (substring->length == 0) {
5418 Py_DECREF(list);
5419 PyErr_SetString(PyExc_ValueError, "empty separator");
5420 return NULL;
5421 }
5422 else
5423 return rsplit_substring(self,list,substring,maxcount);
5424}
5425
5426static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427PyObject *replace(PyUnicodeObject *self,
5428 PyUnicodeObject *str1,
5429 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005430 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
5432 PyUnicodeObject *u;
5433
5434 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005435 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
Thomas Wouters477c8d52006-05-27 19:21:47 +00005437 if (str1->length == str2->length) {
5438 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005439 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 if (str1->length == 1) {
5441 /* replace characters */
5442 Py_UNICODE u1, u2;
5443 if (!findchar(self->str, self->length, str1->str[0]))
5444 goto nothing;
5445 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5446 if (!u)
5447 return NULL;
5448 Py_UNICODE_COPY(u->str, self->str, self->length);
5449 u1 = str1->str[0];
5450 u2 = str2->str[0];
5451 for (i = 0; i < u->length; i++)
5452 if (u->str[i] == u1) {
5453 if (--maxcount < 0)
5454 break;
5455 u->str[i] = u2;
5456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005458 i = fastsearch(
5459 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005461 if (i < 0)
5462 goto nothing;
5463 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5464 if (!u)
5465 return NULL;
5466 Py_UNICODE_COPY(u->str, self->str, self->length);
5467 while (i <= self->length - str1->length)
5468 if (Py_UNICODE_MATCH(self, i, str1)) {
5469 if (--maxcount < 0)
5470 break;
5471 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5472 i += str1->length;
5473 } else
5474 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477
5478 Py_ssize_t n, i, j, e;
5479 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 Py_UNICODE *p;
5481
5482 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005483 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 if (n > maxcount)
5485 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005486 if (n == 0)
5487 goto nothing;
5488 /* new_size = self->length + n * (str2->length - str1->length)); */
5489 delta = (str2->length - str1->length);
5490 if (delta == 0) {
5491 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005493 product = n * (str2->length - str1->length);
5494 if ((product / (str2->length - str1->length)) != n) {
5495 PyErr_SetString(PyExc_OverflowError,
5496 "replace string is too long");
5497 return NULL;
5498 }
5499 new_size = self->length + product;
5500 if (new_size < 0) {
5501 PyErr_SetString(PyExc_OverflowError,
5502 "replace string is too long");
5503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 }
5505 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005506 u = _PyUnicode_New(new_size);
5507 if (!u)
5508 return NULL;
5509 i = 0;
5510 p = u->str;
5511 e = self->length - str1->length;
5512 if (str1->length > 0) {
5513 while (n-- > 0) {
5514 /* look for next match */
5515 j = i;
5516 while (j <= e) {
5517 if (Py_UNICODE_MATCH(self, j, str1))
5518 break;
5519 j++;
5520 }
5521 if (j > i) {
5522 if (j > e)
5523 break;
5524 /* copy unchanged part [i:j] */
5525 Py_UNICODE_COPY(p, self->str+i, j-i);
5526 p += j - i;
5527 }
5528 /* copy substitution string */
5529 if (str2->length > 0) {
5530 Py_UNICODE_COPY(p, str2->str, str2->length);
5531 p += str2->length;
5532 }
5533 i = j + str1->length;
5534 }
5535 if (i < self->length)
5536 /* copy tail [i:] */
5537 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5538 } else {
5539 /* interleave */
5540 while (n > 0) {
5541 Py_UNICODE_COPY(p, str2->str, str2->length);
5542 p += str2->length;
5543 if (--n <= 0)
5544 break;
5545 *p++ = self->str[i++];
5546 }
5547 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005551
5552nothing:
5553 /* nothing to replace; return original string (when possible) */
5554 if (PyUnicode_CheckExact(self)) {
5555 Py_INCREF(self);
5556 return (PyObject *) self;
5557 }
5558 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
5561/* --- Unicode Object Methods --------------------------------------------- */
5562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005563PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564"S.title() -> unicode\n\
5565\n\
5566Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005567characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
5569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005570unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 return fixup(self, fixtitle);
5573}
5574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005575PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576"S.capitalize() -> unicode\n\
5577\n\
5578Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005579have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
5581static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005582unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 return fixup(self, fixcapitalize);
5585}
5586
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* S.capwords() (currently compiled out): split S into a list of words,
   replace every list entry by its fixcapitalize'd version and join the
   list again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)plain,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        /* PyList_SET_ITEM does not release the old entry, so do it here */
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
5624
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005625/* Argument converter. Coerces to a single unicode character */
5626
5627static int
5628convert_uc(PyObject *obj, void *addr)
5629{
5630 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5631 PyObject *uniobj;
5632 Py_UNICODE *unistr;
5633
5634 uniobj = PyUnicode_FromObject(obj);
5635 if (uniobj == NULL) {
5636 PyErr_SetString(PyExc_TypeError,
5637 "The fill character cannot be converted to Unicode");
5638 return 0;
5639 }
5640 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5641 PyErr_SetString(PyExc_TypeError,
5642 "The fill character must be exactly one character long");
5643 Py_DECREF(uniobj);
5644 return 0;
5645 }
5646 unistr = PyUnicode_AS_UNICODE(uniobj);
5647 *fillcharloc = unistr[0];
5648 Py_DECREF(uniobj);
5649 return 1;
5650}
5651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005652PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005653"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005655Return S centered in a Unicode string of length width. Padding is\n\
5656done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
5658static PyObject *
5659unicode_center(PyUnicodeObject *self, PyObject *args)
5660{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005661 Py_ssize_t marg, left;
5662 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005663 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Thomas Woutersde017742006-02-16 19:34:37 +00005665 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return NULL;
5667
Tim Peters7a29bd52001-09-12 03:03:31 +00005668 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 Py_INCREF(self);
5670 return (PyObject*) self;
5671 }
5672
5673 marg = width - self->length;
5674 left = marg / 2 + (marg & width & 1);
5675
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005676 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677}
5678
Marc-André Lemburge5034372000-08-08 08:04:29 +00005679#if 0
5680
5681/* This code should go into some future Unicode collation support
5682 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005683 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005684
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005685/* speedy UTF-16 code point order comparison */
5686/* gleaned from: */
5687/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5688
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005689static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005690{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005691 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005692 0, 0, 0, 0, 0, 0, 0, 0,
5693 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005694 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005695};
5696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697static int
5698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 Py_UNICODE *s1 = str1->str;
5703 Py_UNICODE *s2 = str2->str;
5704
5705 len1 = str1->length;
5706 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005709 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005710
5711 c1 = *s1++;
5712 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005713
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005714 if (c1 > (1<<11) * 26)
5715 c1 += utf16Fixup[c1>>11];
5716 if (c2 > (1<<11) * 26)
5717 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005718 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005719
5720 if (c1 != c2)
5721 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005722
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005723 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
5725
5726 return (len1 < len2) ? -1 : (len1 != len2);
5727}
5728
Marc-André Lemburge5034372000-08-08 08:04:29 +00005729#else
5730
5731static int
5732unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005735
5736 Py_UNICODE *s1 = str1->str;
5737 Py_UNICODE *s2 = str2->str;
5738
5739 len1 = str1->length;
5740 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005741
Marc-André Lemburge5034372000-08-08 08:04:29 +00005742 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005743 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005744
Fredrik Lundh45714e92001-06-26 16:39:36 +00005745 c1 = *s1++;
5746 c2 = *s2++;
5747
5748 if (c1 != c2)
5749 return (c1 < c2) ? -1 : 1;
5750
Marc-André Lemburge5034372000-08-08 08:04:29 +00005751 len1--; len2--;
5752 }
5753
5754 return (len1 < len2) ? -1 : (len1 != len2);
5755}
5756
5757#endif
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759int PyUnicode_Compare(PyObject *left,
5760 PyObject *right)
5761{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005762 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5763 return unicode_compare((PyUnicodeObject *)left,
5764 (PyUnicodeObject *)right);
5765 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5766 (PyUnicode_Check(left) && PyString_Check(right))) {
5767 if (PyUnicode_Check(left))
5768 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5769 if (PyUnicode_Check(right))
5770 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5771 assert(PyString_Check(left));
5772 assert(PyString_Check(right));
5773 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005775 PyErr_Format(PyExc_TypeError,
5776 "Can't compare %.100s and %.100s",
5777 left->ob_type->tp_name,
5778 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return -1;
5780}
5781
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005782PyObject *PyUnicode_RichCompare(PyObject *left,
5783 PyObject *right,
5784 int op)
5785{
5786 int result;
5787
5788 result = PyUnicode_Compare(left, right);
5789 if (result == -1 && PyErr_Occurred())
5790 goto onError;
5791
5792 /* Convert the return value to a Boolean */
5793 switch (op) {
5794 case Py_EQ:
5795 result = (result == 0);
5796 break;
5797 case Py_NE:
5798 result = (result != 0);
5799 break;
5800 case Py_LE:
5801 result = (result <= 0);
5802 break;
5803 case Py_GE:
5804 result = (result >= 0);
5805 break;
5806 case Py_LT:
5807 result = (result == -1);
5808 break;
5809 case Py_GT:
5810 result = (result == 1);
5811 break;
5812 }
5813 return PyBool_FromLong(result);
5814
5815 onError:
5816
5817 /* Standard case
5818
5819 Type errors mean that PyUnicode_FromObject() could not convert
5820 one of the arguments (usually the right hand side) to Unicode,
5821 ie. we can't handle the comparison request. However, it is
5822 possible that the other object knows a comparison method, which
5823 is why we return Py_NotImplemented to give the other object a
5824 chance.
5825
5826 */
5827 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5828 PyErr_Clear();
5829 Py_INCREF(Py_NotImplemented);
5830 return Py_NotImplemented;
5831 }
5832 if (op != Py_EQ && op != Py_NE)
5833 return NULL;
5834
5835 /* Equality comparison.
5836
5837 This is a special case: we silence any PyExc_UnicodeDecodeError
5838 and instead turn it into a PyErr_UnicodeWarning.
5839
5840 */
5841 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5842 return NULL;
5843 PyErr_Clear();
5844 if (PyErr_Warn(PyExc_UnicodeWarning,
5845 (op == Py_EQ) ?
5846 "Unicode equal comparison "
5847 "failed to convert both arguments to Unicode - "
5848 "interpreting them as being unequal" :
5849 "Unicode unequal comparison "
5850 "failed to convert both arguments to Unicode - "
5851 "interpreting them as being unequal"
5852 ) < 0)
5853 return NULL;
5854 result = (op == Py_NE);
5855 return PyBool_FromLong(result);
5856}
5857
Guido van Rossum403d68b2000-03-13 15:55:09 +00005858int PyUnicode_Contains(PyObject *container,
5859 PyObject *element)
5860{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005861 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005863
5864 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005865 sub = PyUnicode_FromObject(element);
5866 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005867 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005868 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005870 }
5871
Thomas Wouters477c8d52006-05-27 19:21:47 +00005872 str = PyUnicode_FromObject(container);
5873 if (!str) {
5874 Py_DECREF(sub);
5875 return -1;
5876 }
5877
5878 result = stringlib_contains_obj(str, sub);
5879
5880 Py_DECREF(str);
5881 Py_DECREF(sub);
5882
Guido van Rossum403d68b2000-03-13 15:55:09 +00005883 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005884}
5885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886/* Concat to string or Unicode object giving a new Unicode object. */
5887
5888PyObject *PyUnicode_Concat(PyObject *left,
5889 PyObject *right)
5890{
5891 PyUnicodeObject *u = NULL, *v = NULL, *w;
5892
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005893 if (PyBytes_Check(left) || PyBytes_Check(right))
5894 return PyBytes_Concat(left, right);
5895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 /* Coerce the two arguments */
5897 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5898 if (u == NULL)
5899 goto onError;
5900 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5901 if (v == NULL)
5902 goto onError;
5903
5904 /* Shortcuts */
5905 if (v == unicode_empty) {
5906 Py_DECREF(v);
5907 return (PyObject *)u;
5908 }
5909 if (u == unicode_empty) {
5910 Py_DECREF(u);
5911 return (PyObject *)v;
5912 }
5913
5914 /* Concat the two Unicode strings */
5915 w = _PyUnicode_New(u->length + v->length);
5916 if (w == NULL)
5917 goto onError;
5918 Py_UNICODE_COPY(w->str, u->str, u->length);
5919 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5920
5921 Py_DECREF(u);
5922 Py_DECREF(v);
5923 return (PyObject *)w;
5924
5925onError:
5926 Py_XDECREF(u);
5927 Py_XDECREF(v);
5928 return NULL;
5929}
5930
Walter Dörwald1ab83302007-05-18 17:15:44 +00005931void
5932PyUnicode_Append(PyObject **pleft, PyObject *right)
5933{
5934 PyObject *new;
5935 if (*pleft == NULL)
5936 return;
5937 if (right == NULL || !PyUnicode_Check(*pleft)) {
5938 Py_DECREF(*pleft);
5939 *pleft = NULL;
5940 return;
5941 }
5942 new = PyUnicode_Concat(*pleft, right);
5943 Py_DECREF(*pleft);
5944 *pleft = new;
5945}
5946
5947void
5948PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
5949{
5950 PyUnicode_Append(pleft, right);
5951 Py_XDECREF(right);
5952}
5953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955"S.count(sub[, start[, end]]) -> int\n\
5956\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005957Return the number of non-overlapping occurrences of substring sub in\n\
5958Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
5961static PyObject *
5962unicode_count(PyUnicodeObject *self, PyObject *args)
5963{
5964 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005966 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 PyObject *result;
5968
Guido van Rossumb8872e62000-05-09 14:14:27 +00005969 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5970 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 return NULL;
5972
5973 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 if (substring == NULL)
5976 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005977
Thomas Wouters477c8d52006-05-27 19:21:47 +00005978 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980 result = PyInt_FromSsize_t(
5981 stringlib_count(self->str + start, end - start,
5982 substring->str, substring->length)
5983 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
5985 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 return result;
5988}
5989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005990PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005991"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005993Encodes S using the codec registered for encoding. encoding defaults\n\
5994to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005995handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5997'xmlcharrefreplace' as well as any other name registered with\n\
5998codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
6000static PyObject *
6001unicode_encode(PyUnicodeObject *self, PyObject *args)
6002{
6003 char *encoding = NULL;
6004 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006005 PyObject *v;
6006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6008 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006009 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006010 if (v == NULL)
6011 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006012 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006013 if (PyString_Check(v)) {
6014 /* Old codec, turn it into bytes */
6015 PyObject *b = PyBytes_FromObject(v);
6016 Py_DECREF(v);
6017 return b;
6018 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006019 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006020 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006021 "(type=%.400s)",
6022 v->ob_type->tp_name);
6023 Py_DECREF(v);
6024 return NULL;
6025 }
6026 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006027
6028 onError:
6029 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006030}
6031
6032PyDoc_STRVAR(decode__doc__,
6033"S.decode([encoding[,errors]]) -> string or unicode\n\
6034\n\
6035Decodes S using the codec registered for encoding. encoding defaults\n\
6036to the default encoding. errors may be given to set a different error\n\
6037handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6038a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6039as well as any other name registerd with codecs.register_error that is\n\
6040able to handle UnicodeDecodeErrors.");
6041
6042static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006043unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006044{
6045 char *encoding = NULL;
6046 char *errors = NULL;
6047 PyObject *v;
6048
6049 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6050 return NULL;
6051 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006052 if (v == NULL)
6053 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006054 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6055 PyErr_Format(PyExc_TypeError,
6056 "decoder did not return a string/unicode object "
6057 "(type=%.400s)",
6058 v->ob_type->tp_name);
6059 Py_DECREF(v);
6060 return NULL;
6061 }
6062 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006063
6064 onError:
6065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069"S.expandtabs([tabsize]) -> unicode\n\
6070\n\
6071Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074static PyObject*
6075unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6076{
6077 Py_UNICODE *e;
6078 Py_UNICODE *p;
6079 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 PyUnicodeObject *u;
6082 int tabsize = 8;
6083
6084 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6085 return NULL;
6086
Thomas Wouters7e474022000-07-16 12:04:32 +00006087 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 i = j = 0;
6089 e = self->str + self->length;
6090 for (p = self->str; p < e; p++)
6091 if (*p == '\t') {
6092 if (tabsize > 0)
6093 j += tabsize - (j % tabsize);
6094 }
6095 else {
6096 j++;
6097 if (*p == '\n' || *p == '\r') {
6098 i += j;
6099 j = 0;
6100 }
6101 }
6102
6103 /* Second pass: create output string and fill it */
6104 u = _PyUnicode_New(i + j);
6105 if (!u)
6106 return NULL;
6107
6108 j = 0;
6109 q = u->str;
6110
6111 for (p = self->str; p < e; p++)
6112 if (*p == '\t') {
6113 if (tabsize > 0) {
6114 i = tabsize - (j % tabsize);
6115 j += i;
6116 while (i--)
6117 *q++ = ' ';
6118 }
6119 }
6120 else {
6121 j++;
6122 *q++ = *p;
6123 if (*p == '\n' || *p == '\r')
6124 j = 0;
6125 }
6126
6127 return (PyObject*) u;
6128}
6129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006130PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131"S.find(sub [,start [,end]]) -> int\n\
6132\n\
6133Return the lowest index in S where substring sub is found,\n\
6134such that sub is contained within s[start,end]. Optional\n\
6135arguments start and end are interpreted as in slice notation.\n\
6136\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006137Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
6139static PyObject *
6140unicode_find(PyUnicodeObject *self, PyObject *args)
6141{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006142 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006143 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006144 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006145 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Guido van Rossumb8872e62000-05-09 14:14:27 +00006147 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6148 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006150 substring = PyUnicode_FromObject(substring);
6151 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 return NULL;
6153
Thomas Wouters477c8d52006-05-27 19:21:47 +00006154 result = stringlib_find_slice(
6155 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6156 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6157 start, end
6158 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
6160 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006161
6162 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163}
6164
6165static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006166unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
6168 if (index < 0 || index >= self->length) {
6169 PyErr_SetString(PyExc_IndexError, "string index out of range");
6170 return NULL;
6171 }
6172
6173 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6174}
6175
6176static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006177unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006179 /* Since Unicode objects compare equal to their UTF-8 string
6180 counterparts, we hash the UTF-8 string. */
6181 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6182 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183}
6184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006185PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186"S.index(sub [,start [,end]]) -> int\n\
6187\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006188Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
6190static PyObject *
6191unicode_index(PyUnicodeObject *self, PyObject *args)
6192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006193 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006194 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006196 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Guido van Rossumb8872e62000-05-09 14:14:27 +00006198 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6199 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006201 substring = PyUnicode_FromObject(substring);
6202 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 return NULL;
6204
Thomas Wouters477c8d52006-05-27 19:21:47 +00006205 result = stringlib_find_slice(
6206 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6207 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6208 start, end
6209 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 if (result < 0) {
6214 PyErr_SetString(PyExc_ValueError, "substring not found");
6215 return NULL;
6216 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006221PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006222"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006225at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
6227static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006228unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229{
6230 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6231 register const Py_UNICODE *e;
6232 int cased;
6233
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 /* Shortcut for single character strings */
6235 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006236 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006238 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006239 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006240 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006241
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 e = p + PyUnicode_GET_SIZE(self);
6243 cased = 0;
6244 for (; p < e; p++) {
6245 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006248 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 else if (!cased && Py_UNICODE_ISLOWER(ch))
6250 cased = 1;
6251 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006252 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006256"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006258Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006259at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006262unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263{
6264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6265 register const Py_UNICODE *e;
6266 int cased;
6267
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 /* Shortcut for single character strings */
6269 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006270 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006272 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006273 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006274 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 e = p + PyUnicode_GET_SIZE(self);
6277 cased = 0;
6278 for (; p < e; p++) {
6279 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006280
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006282 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 else if (!cased && Py_UNICODE_ISUPPER(ch))
6284 cased = 1;
6285 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006286 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287}
6288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006289PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006290"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006292Return True if S is a titlecased string and there is at least one\n\
6293character in S, i.e. upper- and titlecase characters may only\n\
6294follow uncased characters and lowercase characters only cased ones.\n\
6295Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
6297static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006298unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299{
6300 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6301 register const Py_UNICODE *e;
6302 int cased, previous_is_cased;
6303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 /* Shortcut for single character strings */
6305 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006306 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6307 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006309 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006310 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 e = p + PyUnicode_GET_SIZE(self);
6314 cased = 0;
6315 previous_is_cased = 0;
6316 for (; p < e; p++) {
6317 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6320 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006321 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 previous_is_cased = 1;
6323 cased = 1;
6324 }
6325 else if (Py_UNICODE_ISLOWER(ch)) {
6326 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006327 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 previous_is_cased = 1;
6329 cased = 1;
6330 }
6331 else
6332 previous_is_cased = 0;
6333 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006334 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006338"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006340Return True if all characters in S are whitespace\n\
6341and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006344unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
6346 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6347 register const Py_UNICODE *e;
6348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 /* Shortcut for single character strings */
6350 if (PyUnicode_GET_SIZE(self) == 1 &&
6351 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006352 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006354 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006355 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006356 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 e = p + PyUnicode_GET_SIZE(self);
6359 for (; p < e; p++) {
6360 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006361 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364}
6365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006366PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006367"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006368\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006369Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006370and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006371
6372static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006373unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006374{
6375 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6376 register const Py_UNICODE *e;
6377
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006378 /* Shortcut for single character strings */
6379 if (PyUnicode_GET_SIZE(self) == 1 &&
6380 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006381 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006382
6383 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006384 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006385 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006386
6387 e = p + PyUnicode_GET_SIZE(self);
6388 for (; p < e; p++) {
6389 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006391 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006393}
6394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006395PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006396"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006397\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006398Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006399and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006400
6401static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006402unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006403{
6404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6405 register const Py_UNICODE *e;
6406
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006407 /* Shortcut for single character strings */
6408 if (PyUnicode_GET_SIZE(self) == 1 &&
6409 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006410 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006411
6412 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006413 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006414 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006415
6416 e = p + PyUnicode_GET_SIZE(self);
6417 for (; p < e; p++) {
6418 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006421 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006422}
6423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006425"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006428False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
6430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006431unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
6433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6434 register const Py_UNICODE *e;
6435
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 /* Shortcut for single character strings */
6437 if (PyUnicode_GET_SIZE(self) == 1 &&
6438 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006442 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 e = p + PyUnicode_GET_SIZE(self);
6446 for (; p < e; p++) {
6447 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006448 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451}
6452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006453PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006454"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006456Return True if all characters in S are digits\n\
6457and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
6459static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006460unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
6462 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6463 register const Py_UNICODE *e;
6464
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 /* Shortcut for single character strings */
6466 if (PyUnicode_GET_SIZE(self) == 1 &&
6467 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006468 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006470 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006471 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006472 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006473
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 e = p + PyUnicode_GET_SIZE(self);
6475 for (; p < e; p++) {
6476 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006477 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006483"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006485Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
6488static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006489unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
6491 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6492 register const Py_UNICODE *e;
6493
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 /* Shortcut for single character strings */
6495 if (PyUnicode_GET_SIZE(self) == 1 &&
6496 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006499 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006500 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006501 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 e = p + PyUnicode_GET_SIZE(self);
6504 for (; p < e; p++) {
6505 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006508 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509}
6510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006511PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512"S.join(sequence) -> unicode\n\
6513\n\
6514Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
6517static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006518unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006520 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Martin v. Löwis18e16552006-02-15 17:27:45 +00006523static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524unicode_length(PyUnicodeObject *self)
6525{
6526 return self->length;
6527}
6528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006529PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006530"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531\n\
6532Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006533done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
6535static PyObject *
6536unicode_ljust(PyUnicodeObject *self, PyObject *args)
6537{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006538 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006539 Py_UNICODE fillchar = ' ';
6540
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006541 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return NULL;
6543
Tim Peters7a29bd52001-09-12 03:03:31 +00006544 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 Py_INCREF(self);
6546 return (PyObject*) self;
6547 }
6548
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006549 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006552PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553"S.lower() -> unicode\n\
6554\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006555Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
6557static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006558unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 return fixup(self, fixlower);
6561}
6562
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string minus its 3-char "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6571
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006572/* externally visible for str.strip(unicode) */
6573PyObject *
6574_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6575{
6576 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006577 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006578 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6580 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006581
Thomas Wouters477c8d52006-05-27 19:21:47 +00006582 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6583
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006584 i = 0;
6585 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6587 i++;
6588 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006589 }
6590
6591 j = len;
6592 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006593 do {
6594 j--;
6595 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6596 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006597 }
6598
6599 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006600 Py_INCREF(self);
6601 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006602 }
6603 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006604 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006605}
6606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
6608static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006609do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006611 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006612 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006613
6614 i = 0;
6615 if (striptype != RIGHTSTRIP) {
6616 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6617 i++;
6618 }
6619 }
6620
6621 j = len;
6622 if (striptype != LEFTSTRIP) {
6623 do {
6624 j--;
6625 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6626 j++;
6627 }
6628
6629 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6630 Py_INCREF(self);
6631 return (PyObject*)self;
6632 }
6633 else
6634 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006637
6638static PyObject *
6639do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6640{
6641 PyObject *sep = NULL;
6642
6643 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6644 return NULL;
6645
6646 if (sep != NULL && sep != Py_None) {
6647 if (PyUnicode_Check(sep))
6648 return _PyUnicode_XStrip(self, striptype, sep);
6649 else if (PyString_Check(sep)) {
6650 PyObject *res;
6651 sep = PyUnicode_FromObject(sep);
6652 if (sep==NULL)
6653 return NULL;
6654 res = _PyUnicode_XStrip(self, striptype, sep);
6655 Py_DECREF(sep);
6656 return res;
6657 }
6658 else {
6659 PyErr_Format(PyExc_TypeError,
6660 "%s arg must be None, unicode or str",
6661 STRIPNAME(striptype));
6662 return NULL;
6663 }
6664 }
6665
6666 return do_strip(self, striptype);
6667}
6668
6669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006671"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006672\n\
6673Return a copy of the string S with leading and trailing\n\
6674whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006675If chars is given and not None, remove characters in chars instead.\n\
6676If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006677
6678static PyObject *
6679unicode_strip(PyUnicodeObject *self, PyObject *args)
6680{
6681 if (PyTuple_GET_SIZE(args) == 0)
6682 return do_strip(self, BOTHSTRIP); /* Common case */
6683 else
6684 return do_argstrip(self, BOTHSTRIP, args);
6685}
6686
6687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006688PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006689"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006690\n\
6691Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006692If chars is given and not None, remove characters in chars instead.\n\
6693If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006694
6695static PyObject *
6696unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6697{
6698 if (PyTuple_GET_SIZE(args) == 0)
6699 return do_strip(self, LEFTSTRIP); /* Common case */
6700 else
6701 return do_argstrip(self, LEFTSTRIP, args);
6702}
6703
6704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006705PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006706"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006707\n\
6708Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006709If chars is given and not None, remove characters in chars instead.\n\
6710If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006711
6712static PyObject *
6713unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6714{
6715 if (PyTuple_GET_SIZE(args) == 0)
6716 return do_strip(self, RIGHTSTRIP); /* Common case */
6717 else
6718 return do_argstrip(self, RIGHTSTRIP, args);
6719}
6720
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006723unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
6725 PyUnicodeObject *u;
6726 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006728 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730 if (len < 0)
6731 len = 0;
6732
Tim Peters7a29bd52001-09-12 03:03:31 +00006733 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 /* no repeat, return original string */
6735 Py_INCREF(str);
6736 return (PyObject*) str;
6737 }
Tim Peters8f422462000-09-09 06:13:41 +00006738
6739 /* ensure # of chars needed doesn't overflow int and # of bytes
6740 * needed doesn't overflow size_t
6741 */
6742 nchars = len * str->length;
6743 if (len && nchars / len != str->length) {
6744 PyErr_SetString(PyExc_OverflowError,
6745 "repeated string is too long");
6746 return NULL;
6747 }
6748 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6749 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6750 PyErr_SetString(PyExc_OverflowError,
6751 "repeated string is too long");
6752 return NULL;
6753 }
6754 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 if (!u)
6756 return NULL;
6757
6758 p = u->str;
6759
Thomas Wouters477c8d52006-05-27 19:21:47 +00006760 if (str->length == 1 && len > 0) {
6761 Py_UNICODE_FILL(p, str->str[0], len);
6762 } else {
6763 Py_ssize_t done = 0; /* number of characters copied this far */
6764 if (done < nchars) {
6765 Py_UNICODE_COPY(p, str->str, str->length);
6766 done = str->length;
6767 }
6768 while (done < nchars) {
6769 int n = (done <= nchars-done) ? done : nchars-done;
6770 Py_UNICODE_COPY(p+done, p, n);
6771 done += n;
6772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
6774
6775 return (PyObject*) u;
6776}
6777
6778PyObject *PyUnicode_Replace(PyObject *obj,
6779 PyObject *subobj,
6780 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
6783 PyObject *self;
6784 PyObject *str1;
6785 PyObject *str2;
6786 PyObject *result;
6787
6788 self = PyUnicode_FromObject(obj);
6789 if (self == NULL)
6790 return NULL;
6791 str1 = PyUnicode_FromObject(subobj);
6792 if (str1 == NULL) {
6793 Py_DECREF(self);
6794 return NULL;
6795 }
6796 str2 = PyUnicode_FromObject(replobj);
6797 if (str2 == NULL) {
6798 Py_DECREF(self);
6799 Py_DECREF(str1);
6800 return NULL;
6801 }
Tim Petersced69f82003-09-16 20:30:58 +00006802 result = replace((PyUnicodeObject *)self,
6803 (PyUnicodeObject *)str1,
6804 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 maxcount);
6806 Py_DECREF(self);
6807 Py_DECREF(str1);
6808 Py_DECREF(str2);
6809 return result;
6810}
6811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006812PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813"S.replace (old, new[, maxsplit]) -> unicode\n\
6814\n\
6815Return a copy of S with all occurrences of substring\n\
6816old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
6819static PyObject*
6820unicode_replace(PyUnicodeObject *self, PyObject *args)
6821{
6822 PyUnicodeObject *str1;
6823 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006824 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 PyObject *result;
6826
Martin v. Löwis18e16552006-02-15 17:27:45 +00006827 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 return NULL;
6829 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6830 if (str1 == NULL)
6831 return NULL;
6832 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006833 if (str2 == NULL) {
6834 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838 result = replace(self, str1, str2, maxcount);
6839
6840 Py_DECREF(str1);
6841 Py_DECREF(str2);
6842 return result;
6843}
6844
6845static
6846PyObject *unicode_repr(PyObject *unicode)
6847{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006848 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006849 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006850 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6851 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6852
6853 /* XXX(nnorwitz): rather than over-allocating, it would be
6854 better to choose a different scheme. Perhaps scan the
6855 first N-chars of the string and allocate based on that size.
6856 */
6857 /* Initial allocation is based on the longest-possible unichr
6858 escape.
6859
6860 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6861 unichr, so in this case it's the longest unichr escape. In
6862 narrow (UTF-16) builds this is five chars per source unichr
6863 since there are two unichrs in the surrogate pair, so in narrow
6864 (UTF-16) builds it's not the longest unichr escape.
6865
6866 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6867 so in the narrow (UTF-16) build case it's the longest unichr
6868 escape.
6869 */
6870
Walter Dörwald1ab83302007-05-18 17:15:44 +00006871 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006872 2 /* quotes */
6873#ifdef Py_UNICODE_WIDE
6874 + 10*size
6875#else
6876 + 6*size
6877#endif
6878 + 1);
6879 if (repr == NULL)
6880 return NULL;
6881
Walter Dörwald1ab83302007-05-18 17:15:44 +00006882 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006883
6884 /* Add quote */
6885 *p++ = (findchar(s, size, '\'') &&
6886 !findchar(s, size, '"')) ? '"' : '\'';
6887 while (size-- > 0) {
6888 Py_UNICODE ch = *s++;
6889
6890 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006891 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006892 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006893 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006894 continue;
6895 }
6896
6897#ifdef Py_UNICODE_WIDE
6898 /* Map 21-bit characters to '\U00xxxxxx' */
6899 else if (ch >= 0x10000) {
6900 *p++ = '\\';
6901 *p++ = 'U';
6902 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6903 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6904 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6905 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6906 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6907 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6908 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6909 *p++ = hexdigits[ch & 0x0000000F];
6910 continue;
6911 }
6912#else
6913 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6914 else if (ch >= 0xD800 && ch < 0xDC00) {
6915 Py_UNICODE ch2;
6916 Py_UCS4 ucs;
6917
6918 ch2 = *s++;
6919 size--;
6920 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6921 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6922 *p++ = '\\';
6923 *p++ = 'U';
6924 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6925 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6926 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6927 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6928 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6929 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6930 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6931 *p++ = hexdigits[ucs & 0x0000000F];
6932 continue;
6933 }
6934 /* Fall through: isolated surrogates are copied as-is */
6935 s--;
6936 size++;
6937 }
6938#endif
6939
6940 /* Map 16-bit characters to '\uxxxx' */
6941 if (ch >= 256) {
6942 *p++ = '\\';
6943 *p++ = 'u';
6944 *p++ = hexdigits[(ch >> 12) & 0x000F];
6945 *p++ = hexdigits[(ch >> 8) & 0x000F];
6946 *p++ = hexdigits[(ch >> 4) & 0x000F];
6947 *p++ = hexdigits[ch & 0x000F];
6948 }
6949
6950 /* Map special whitespace to '\t', \n', '\r' */
6951 else if (ch == '\t') {
6952 *p++ = '\\';
6953 *p++ = 't';
6954 }
6955 else if (ch == '\n') {
6956 *p++ = '\\';
6957 *p++ = 'n';
6958 }
6959 else if (ch == '\r') {
6960 *p++ = '\\';
6961 *p++ = 'r';
6962 }
6963
6964 /* Map non-printable US ASCII to '\xhh' */
6965 else if (ch < ' ' || ch >= 0x7F) {
6966 *p++ = '\\';
6967 *p++ = 'x';
6968 *p++ = hexdigits[(ch >> 4) & 0x000F];
6969 *p++ = hexdigits[ch & 0x000F];
6970 }
6971
6972 /* Copy everything else as-is */
6973 else
6974 *p++ = (char) ch;
6975 }
6976 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006977 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00006978
6979 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006980 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00006981 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982}
6983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006984PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985"S.rfind(sub [,start [,end]]) -> int\n\
6986\n\
6987Return the highest index in S where substring sub is found,\n\
6988such that sub is contained within s[start,end]. Optional\n\
6989arguments start and end are interpreted as in slice notation.\n\
6990\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992
6993static PyObject *
6994unicode_rfind(PyUnicodeObject *self, PyObject *args)
6995{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006996 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006998 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006999 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000
Guido van Rossumb8872e62000-05-09 14:14:27 +00007001 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7002 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007004 substring = PyUnicode_FromObject(substring);
7005 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 return NULL;
7007
Thomas Wouters477c8d52006-05-27 19:21:47 +00007008 result = stringlib_rfind_slice(
7009 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7010 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7011 start, end
7012 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
7014 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007015
7016 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020"S.rindex(sub [,start [,end]]) -> int\n\
7021\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007022Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
7024static PyObject *
7025unicode_rindex(PyUnicodeObject *self, PyObject *args)
7026{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007027 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007028 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007029 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007030 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
Guido van Rossumb8872e62000-05-09 14:14:27 +00007032 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7033 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007035 substring = PyUnicode_FromObject(substring);
7036 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 return NULL;
7038
Thomas Wouters477c8d52006-05-27 19:21:47 +00007039 result = stringlib_rfind_slice(
7040 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7041 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7042 start, end
7043 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
7045 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007046
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 if (result < 0) {
7048 PyErr_SetString(PyExc_ValueError, "substring not found");
7049 return NULL;
7050 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007051 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007055"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
7057Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007058done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject *
7061unicode_rjust(PyUnicodeObject *self, PyObject *args)
7062{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007063 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007064 Py_UNICODE fillchar = ' ';
7065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007066 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 return NULL;
7068
Tim Peters7a29bd52001-09-12 03:03:31 +00007069 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 Py_INCREF(self);
7071 return (PyObject*) self;
7072 }
7073
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007074 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075}
7076
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007078unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079{
7080 /* standard clamping */
7081 if (start < 0)
7082 start = 0;
7083 if (end < 0)
7084 end = 0;
7085 if (end > self->length)
7086 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007087 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 /* full slice, return original string */
7089 Py_INCREF(self);
7090 return (PyObject*) self;
7091 }
7092 if (start > end)
7093 start = end;
7094 /* copy slice */
7095 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7096 end - start);
7097}
7098
7099PyObject *PyUnicode_Split(PyObject *s,
7100 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007101 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
7103 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007104
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 s = PyUnicode_FromObject(s);
7106 if (s == NULL)
7107 return NULL;
7108 if (sep != NULL) {
7109 sep = PyUnicode_FromObject(sep);
7110 if (sep == NULL) {
7111 Py_DECREF(s);
7112 return NULL;
7113 }
7114 }
7115
7116 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7117
7118 Py_DECREF(s);
7119 Py_XDECREF(sep);
7120 return result;
7121}
7122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124"S.split([sep [,maxsplit]]) -> list of strings\n\
7125\n\
7126Return a list of the words in S, using sep as the\n\
7127delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007128splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007129any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131static PyObject*
7132unicode_split(PyUnicodeObject *self, PyObject *args)
7133{
7134 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
Martin v. Löwis18e16552006-02-15 17:27:45 +00007137 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 return NULL;
7139
7140 if (substring == Py_None)
7141 return split(self, NULL, maxcount);
7142 else if (PyUnicode_Check(substring))
7143 return split(self, (PyUnicodeObject *)substring, maxcount);
7144 else
7145 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7146}
7147
Thomas Wouters477c8d52006-05-27 19:21:47 +00007148PyObject *
7149PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7150{
7151 PyObject* str_obj;
7152 PyObject* sep_obj;
7153 PyObject* out;
7154
7155 str_obj = PyUnicode_FromObject(str_in);
7156 if (!str_obj)
7157 return NULL;
7158 sep_obj = PyUnicode_FromObject(sep_in);
7159 if (!sep_obj) {
7160 Py_DECREF(str_obj);
7161 return NULL;
7162 }
7163
7164 out = stringlib_partition(
7165 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7166 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7167 );
7168
7169 Py_DECREF(sep_obj);
7170 Py_DECREF(str_obj);
7171
7172 return out;
7173}
7174
7175
7176PyObject *
7177PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7178{
7179 PyObject* str_obj;
7180 PyObject* sep_obj;
7181 PyObject* out;
7182
7183 str_obj = PyUnicode_FromObject(str_in);
7184 if (!str_obj)
7185 return NULL;
7186 sep_obj = PyUnicode_FromObject(sep_in);
7187 if (!sep_obj) {
7188 Py_DECREF(str_obj);
7189 return NULL;
7190 }
7191
7192 out = stringlib_rpartition(
7193 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7194 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7195 );
7196
7197 Py_DECREF(sep_obj);
7198 Py_DECREF(str_obj);
7199
7200 return out;
7201}
7202
7203PyDoc_STRVAR(partition__doc__,
7204"S.partition(sep) -> (head, sep, tail)\n\
7205\n\
7206Searches for the separator sep in S, and returns the part before it,\n\
7207the separator itself, and the part after it. If the separator is not\n\
7208found, returns S and two empty strings.");
7209
7210static PyObject*
7211unicode_partition(PyUnicodeObject *self, PyObject *separator)
7212{
7213 return PyUnicode_Partition((PyObject *)self, separator);
7214}
7215
7216PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007217"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007218\n\
7219Searches for the separator sep in S, starting at the end of S, and returns\n\
7220the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007221separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007222
7223static PyObject*
7224unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7225{
7226 return PyUnicode_RPartition((PyObject *)self, separator);
7227}
7228
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007229PyObject *PyUnicode_RSplit(PyObject *s,
7230 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007232{
7233 PyObject *result;
7234
7235 s = PyUnicode_FromObject(s);
7236 if (s == NULL)
7237 return NULL;
7238 if (sep != NULL) {
7239 sep = PyUnicode_FromObject(sep);
7240 if (sep == NULL) {
7241 Py_DECREF(s);
7242 return NULL;
7243 }
7244 }
7245
7246 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7247
7248 Py_DECREF(s);
7249 Py_XDECREF(sep);
7250 return result;
7251}
7252
7253PyDoc_STRVAR(rsplit__doc__,
7254"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7255\n\
7256Return a list of the words in S, using sep as the\n\
7257delimiter string, starting at the end of the string and\n\
7258working to the front. If maxsplit is given, at most maxsplit\n\
7259splits are done. If sep is not specified, any whitespace string\n\
7260is a separator.");
7261
7262static PyObject*
7263unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7264{
7265 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007267
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007269 return NULL;
7270
7271 if (substring == Py_None)
7272 return rsplit(self, NULL, maxcount);
7273 else if (PyUnicode_Check(substring))
7274 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7275 else
7276 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7277}
7278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007279PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007280"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281\n\
7282Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007283Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007284is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
7286static PyObject*
7287unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7288{
Guido van Rossum86662912000-04-11 15:38:46 +00007289 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
Guido van Rossum86662912000-04-11 15:38:46 +00007291 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 return NULL;
7293
Guido van Rossum86662912000-04-11 15:38:46 +00007294 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295}
7296
7297static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007298PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007300 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7301 Py_XINCREF(res);
7302 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303}
7304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007305PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306"S.swapcase() -> unicode\n\
7307\n\
7308Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310
7311static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007312unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 return fixup(self, fixswapcase);
7315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318"S.translate(table) -> unicode\n\
7319\n\
7320Return a copy of the string S, where all characters have been mapped\n\
7321through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007322Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7323Unmapped characters are left untouched. Characters mapped to None\n\
7324are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
7326static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007327unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328{
Tim Petersced69f82003-09-16 20:30:58 +00007329 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007331 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 "ignore");
7333}
7334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007335PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336"S.upper() -> unicode\n\
7337\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007338Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
7340static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007341unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 return fixup(self, fixupper);
7344}
7345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007346PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347"S.zfill(width) -> unicode\n\
7348\n\
7349Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007350of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351
7352static PyObject *
7353unicode_zfill(PyUnicodeObject *self, PyObject *args)
7354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007355 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 PyUnicodeObject *u;
7357
Martin v. Löwis18e16552006-02-15 17:27:45 +00007358 Py_ssize_t width;
7359 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 return NULL;
7361
7362 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007363 if (PyUnicode_CheckExact(self)) {
7364 Py_INCREF(self);
7365 return (PyObject*) self;
7366 }
7367 else
7368 return PyUnicode_FromUnicode(
7369 PyUnicode_AS_UNICODE(self),
7370 PyUnicode_GET_SIZE(self)
7371 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 }
7373
7374 fill = width - self->length;
7375
7376 u = pad(self, fill, 0, '0');
7377
Walter Dörwald068325e2002-04-15 13:36:47 +00007378 if (u == NULL)
7379 return NULL;
7380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 if (u->str[fill] == '+' || u->str[fill] == '-') {
7382 /* move sign to beginning of string */
7383 u->str[0] = u->str[fill];
7384 u->str[fill] = '0';
7385 }
7386
7387 return (PyObject*) u;
7388}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
#if 0
/* Compiled out: reports the current value of unicode_freelist_size. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007398PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007399"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007401Return True if S starts with the specified prefix, False otherwise.\n\
7402With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007403With optional end, stop comparing S at that position.\n\
7404prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406static PyObject *
7407unicode_startswith(PyUnicodeObject *self,
7408 PyObject *args)
7409{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007412 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007413 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007416 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007417 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419 if (PyTuple_Check(subobj)) {
7420 Py_ssize_t i;
7421 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7422 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7423 PyTuple_GET_ITEM(subobj, i));
7424 if (substring == NULL)
7425 return NULL;
7426 result = tailmatch(self, substring, start, end, -1);
7427 Py_DECREF(substring);
7428 if (result) {
7429 Py_RETURN_TRUE;
7430 }
7431 }
7432 /* nothing matched */
7433 Py_RETURN_FALSE;
7434 }
7435 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437 return NULL;
7438 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441}
7442
7443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007444PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007445"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007447Return True if S ends with the specified suffix, False otherwise.\n\
7448With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449With optional end, stop comparing S at that position.\n\
7450suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
7452static PyObject *
7453unicode_endswith(PyUnicodeObject *self,
7454 PyObject *args)
7455{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007459 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007460 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7463 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465 if (PyTuple_Check(subobj)) {
7466 Py_ssize_t i;
7467 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7468 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7469 PyTuple_GET_ITEM(subobj, i));
7470 if (substring == NULL)
7471 return NULL;
7472 result = tailmatch(self, substring, start, end, +1);
7473 Py_DECREF(substring);
7474 if (result) {
7475 Py_RETURN_TRUE;
7476 }
7477 }
7478 Py_RETURN_FALSE;
7479 }
7480 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
7488
7489
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007490
7491static PyObject *
7492unicode_getnewargs(PyUnicodeObject *v)
7493{
7494 return Py_BuildValue("(u#)", v->str, v->length);
7495}
7496
7497
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498static PyMethodDef unicode_methods[] = {
7499
7500 /* Order is according to common usage: often used methods should
7501 appear first, since lookup is done sequentially. */
7502
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007503 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7504 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7505 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007506 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007507 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7508 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7509 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7510 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7511 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7512 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7513 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007514 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7516 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7517 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007518 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007519 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007520/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7521 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7522 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7523 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007524 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007525 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007526 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007527 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007528 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7529 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7530 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7531 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7532 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7533 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7534 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7535 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7536 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7537 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7538 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7539 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7540 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7541 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007542 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007543#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545#endif
7546
7547#if 0
7548 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007549 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550#endif
7551
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007552 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 {NULL, NULL}
7554};
7555
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007556static PyObject *
7557unicode_mod(PyObject *v, PyObject *w)
7558{
7559 if (!PyUnicode_Check(v)) {
7560 Py_INCREF(Py_NotImplemented);
7561 return Py_NotImplemented;
7562 }
7563 return PyUnicode_Format(v, w);
7564}
7565
7566static PyNumberMethods unicode_as_number = {
7567 0, /*nb_add*/
7568 0, /*nb_subtract*/
7569 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007570 unicode_mod, /*nb_remainder*/
7571};
7572
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007574 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007575 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007576 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7577 (ssizeargfunc) unicode_getitem, /* sq_item */
7578 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 0, /* sq_ass_item */
7580 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007581 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582};
7583
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007584static PyObject*
7585unicode_subscript(PyUnicodeObject* self, PyObject* item)
7586{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007587 if (PyIndex_Check(item)) {
7588 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007589 if (i == -1 && PyErr_Occurred())
7590 return NULL;
7591 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007592 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007593 return unicode_getitem(self, i);
7594 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007595 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007596 Py_UNICODE* source_buf;
7597 Py_UNICODE* result_buf;
7598 PyObject* result;
7599
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007600 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007601 &start, &stop, &step, &slicelength) < 0) {
7602 return NULL;
7603 }
7604
7605 if (slicelength <= 0) {
7606 return PyUnicode_FromUnicode(NULL, 0);
7607 } else {
7608 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007609 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7610 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007611
7612 if (result_buf == NULL)
7613 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007614
7615 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7616 result_buf[i] = source_buf[cur];
7617 }
Tim Petersced69f82003-09-16 20:30:58 +00007618
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007619 result = PyUnicode_FromUnicode(result_buf, slicelength);
7620 PyMem_FREE(result_buf);
7621 return result;
7622 }
7623 } else {
7624 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7625 return NULL;
7626 }
7627}
7628
7629static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007631 (binaryfunc)unicode_subscript, /* mp_subscript */
7632 (objobjargproc)0, /* mp_ass_subscript */
7633};
7634
Martin v. Löwis18e16552006-02-15 17:27:45 +00007635static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007637 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 const void **ptr)
7639{
7640 if (index != 0) {
7641 PyErr_SetString(PyExc_SystemError,
7642 "accessing non-existent unicode segment");
7643 return -1;
7644 }
7645 *ptr = (void *) self->str;
7646 return PyUnicode_GET_DATA_SIZE(self);
7647}
7648
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649static Py_ssize_t
7650unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 const void **ptr)
7652{
7653 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007654 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 return -1;
7656}
7657
7658static int
7659unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661{
7662 if (lenp)
7663 *lenp = PyUnicode_GET_DATA_SIZE(self);
7664 return 1;
7665}
7666
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007667static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 const void **ptr)
7671{
7672 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007673
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 if (index != 0) {
7675 PyErr_SetString(PyExc_SystemError,
7676 "accessing non-existent unicode segment");
7677 return -1;
7678 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007679 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 if (str == NULL)
7681 return -1;
7682 *ptr = (void *) PyString_AS_STRING(str);
7683 return PyString_GET_SIZE(str);
7684}
7685
7686/* Helpers for PyUnicode_Format() */
7687
7688static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007689getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 if (argidx < arglen) {
7693 (*p_argidx)++;
7694 if (arglen < 0)
7695 return args;
7696 else
7697 return PyTuple_GetItem(args, argidx);
7698 }
7699 PyErr_SetString(PyExc_TypeError,
7700 "not enough arguments for format string");
7701 return NULL;
7702}
7703
/* Bit flags recording which flag characters were seen in a '%'
   conversion specifier; they are OR-ed together into one int. */
#define F_LJUST 0x01    /* '-' : left-justify within the field width */
#define F_SIGN  0x02    /* '+' : always emit a sign */
#define F_BLANK 0x04    /* ' ' : emit a blank where a '+' would go */
#define F_ALT   0x08    /* '#' : alternate form */
#define F_ZERO  0x10    /* '0' : pad numbers with zeros */
7709
Martin v. Löwis18e16552006-02-15 17:27:45 +00007710static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007711strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007713 register Py_ssize_t i;
7714 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 for (i = len - 1; i >= 0; i--)
7716 buffer[i] = (Py_UNICODE) charbuffer[i];
7717
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 return len;
7719}
7720
Neal Norwitzfc76d632006-01-10 06:03:13 +00007721static int
7722doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7723{
Tim Peters15231542006-02-16 01:08:01 +00007724 Py_ssize_t result;
7725
Neal Norwitzfc76d632006-01-10 06:03:13 +00007726 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007727 result = strtounicode(buffer, (char *)buffer);
7728 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007729}
7730
7731static int
7732longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7733{
Tim Peters15231542006-02-16 01:08:01 +00007734 Py_ssize_t result;
7735
Neal Norwitzfc76d632006-01-10 06:03:13 +00007736 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007737 result = strtounicode(buffer, (char *)buffer);
7738 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007739}
7740
Guido van Rossum078151d2002-08-11 04:24:12 +00007741/* XXX To save some code duplication, formatfloat/long/int could have been
7742 shared with stringobject.c, converting from 8-bit to Unicode after the
7743 formatting is done. */
7744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745static int
7746formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007747 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 int flags,
7749 int prec,
7750 int type,
7751 PyObject *v)
7752{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007753 /* fmt = '%#.' + `prec` + `type`
7754 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 char fmt[20];
7756 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007757
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 x = PyFloat_AsDouble(v);
7759 if (x == -1.0 && PyErr_Occurred())
7760 return -1;
7761 if (prec < 0)
7762 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7764 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007765 /* Worst case length calc to ensure no buffer overrun:
7766
7767 'g' formats:
7768 fmt = %#.<prec>g
7769 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7770 for any double rep.)
7771 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7772
7773 'f' formats:
7774 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7775 len = 1 + 50 + 1 + prec = 52 + prec
7776
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007777 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007778 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007779
7780 */
7781 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7782 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007783 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007784 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007785 return -1;
7786 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007787 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7788 (flags&F_ALT) ? "#" : "",
7789 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007790 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791}
7792
Tim Peters38fd5b62000-09-21 05:43:11 +00007793static PyObject*
7794formatlong(PyObject *val, int flags, int prec, int type)
7795{
7796 char *buf;
7797 int i, len;
7798 PyObject *str; /* temporary string object. */
7799 PyUnicodeObject *result;
7800
7801 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7802 if (!str)
7803 return NULL;
7804 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007805 if (!result) {
7806 Py_DECREF(str);
7807 return NULL;
7808 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007809 for (i = 0; i < len; i++)
7810 result->str[i] = buf[i];
7811 result->str[len] = 0;
7812 Py_DECREF(str);
7813 return (PyObject*)result;
7814}
7815
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816static int
7817formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007818 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 int flags,
7820 int prec,
7821 int type,
7822 PyObject *v)
7823{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007824 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007825 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7826 * + 1 + 1
7827 * = 24
7828 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007829 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007830 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 long x;
7832
7833 x = PyInt_AsLong(v);
7834 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007835 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007836 if (x < 0 && type == 'u') {
7837 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007838 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007839 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7840 sign = "-";
7841 else
7842 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007844 prec = 1;
7845
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007846 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7847 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007848 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007849 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007850 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007851 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007852 return -1;
7853 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007854
7855 if ((flags & F_ALT) &&
7856 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007857 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007858 * of issues that cause pain:
7859 * - when 0 is being converted, the C standard leaves off
7860 * the '0x' or '0X', which is inconsistent with other
7861 * %#x/%#X conversions and inconsistent with Python's
7862 * hex() function
7863 * - there are platforms that violate the standard and
7864 * convert 0 with the '0x' or '0X'
7865 * (Metrowerks, Compaq Tru64)
7866 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007867 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007868 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007869 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007870 * We can achieve the desired consistency by inserting our
7871 * own '0x' or '0X' prefix, and substituting %x/%X in place
7872 * of %#x/%#X.
7873 *
7874 * Note that this is the same approach as used in
7875 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007876 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007877 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7878 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007879 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007880 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007881 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7882 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007883 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007884 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007885 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007886 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007887 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007888 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889}
7890
7891static int
7892formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007893 size_t buflen,
7894 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007896 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007897 if (PyUnicode_Check(v)) {
7898 if (PyUnicode_GET_SIZE(v) != 1)
7899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007903 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007904 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007905 goto onError;
7906 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908
7909 else {
7910 /* Integer input truncated to a character */
7911 long x;
7912 x = PyInt_AsLong(v);
7913 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007914 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007915#ifdef Py_UNICODE_WIDE
7916 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007917 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007918 "%c arg not in range(0x110000) "
7919 "(wide Python build)");
7920 return -1;
7921 }
7922#else
7923 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007924 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007925 "%c arg not in range(0x10000) "
7926 "(narrow Python build)");
7927 return -1;
7928 }
7929#endif
7930 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 }
7932 buf[1] = '\0';
7933 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007934
7935 onError:
7936 PyErr_SetString(PyExc_TypeError,
7937 "%c requires int or char");
7938 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939}
7940
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
7950
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951PyObject *PyUnicode_Format(PyObject *format,
7952 PyObject *args)
7953{
7954 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 int args_owned = 0;
7957 PyUnicodeObject *result = NULL;
7958 PyObject *dict = NULL;
7959 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007960
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 if (format == NULL || args == NULL) {
7962 PyErr_BadInternalCall();
7963 return NULL;
7964 }
7965 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007966 if (uformat == NULL)
7967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 fmt = PyUnicode_AS_UNICODE(uformat);
7969 fmtcnt = PyUnicode_GET_SIZE(uformat);
7970
7971 reslen = rescnt = fmtcnt + 100;
7972 result = _PyUnicode_New(reslen);
7973 if (result == NULL)
7974 goto onError;
7975 res = PyUnicode_AS_UNICODE(result);
7976
7977 if (PyTuple_Check(args)) {
7978 arglen = PyTuple_Size(args);
7979 argidx = 0;
7980 }
7981 else {
7982 arglen = -1;
7983 argidx = -2;
7984 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007985 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7986 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 dict = args;
7988
7989 while (--fmtcnt >= 0) {
7990 if (*fmt != '%') {
7991 if (--rescnt < 0) {
7992 rescnt = fmtcnt + 100;
7993 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007994 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007995 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7997 --rescnt;
7998 }
7999 *res++ = *fmt++;
8000 }
8001 else {
8002 /* Got a format specifier */
8003 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008004 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 Py_UNICODE c = '\0';
8007 Py_UNICODE fill;
8008 PyObject *v = NULL;
8009 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008010 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008012 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008013 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014
8015 fmt++;
8016 if (*fmt == '(') {
8017 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008018 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 PyObject *key;
8020 int pcount = 1;
8021
8022 if (dict == NULL) {
8023 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008024 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 goto onError;
8026 }
8027 ++fmt;
8028 --fmtcnt;
8029 keystart = fmt;
8030 /* Skip over balanced parentheses */
8031 while (pcount > 0 && --fmtcnt >= 0) {
8032 if (*fmt == ')')
8033 --pcount;
8034 else if (*fmt == '(')
8035 ++pcount;
8036 fmt++;
8037 }
8038 keylen = fmt - keystart - 1;
8039 if (fmtcnt < 0 || pcount > 0) {
8040 PyErr_SetString(PyExc_ValueError,
8041 "incomplete format key");
8042 goto onError;
8043 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008044#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008045 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 then looked up since Python uses strings to hold
8047 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008048 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 key = PyUnicode_EncodeUTF8(keystart,
8050 keylen,
8051 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008052#else
8053 key = PyUnicode_FromUnicode(keystart, keylen);
8054#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 if (key == NULL)
8056 goto onError;
8057 if (args_owned) {
8058 Py_DECREF(args);
8059 args_owned = 0;
8060 }
8061 args = PyObject_GetItem(dict, key);
8062 Py_DECREF(key);
8063 if (args == NULL) {
8064 goto onError;
8065 }
8066 args_owned = 1;
8067 arglen = -1;
8068 argidx = -2;
8069 }
8070 while (--fmtcnt >= 0) {
8071 switch (c = *fmt++) {
8072 case '-': flags |= F_LJUST; continue;
8073 case '+': flags |= F_SIGN; continue;
8074 case ' ': flags |= F_BLANK; continue;
8075 case '#': flags |= F_ALT; continue;
8076 case '0': flags |= F_ZERO; continue;
8077 }
8078 break;
8079 }
8080 if (c == '*') {
8081 v = getnextarg(args, arglen, &argidx);
8082 if (v == NULL)
8083 goto onError;
8084 if (!PyInt_Check(v)) {
8085 PyErr_SetString(PyExc_TypeError,
8086 "* wants int");
8087 goto onError;
8088 }
8089 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008090 if (width == -1 && PyErr_Occurred())
8091 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 if (width < 0) {
8093 flags |= F_LJUST;
8094 width = -width;
8095 }
8096 if (--fmtcnt >= 0)
8097 c = *fmt++;
8098 }
8099 else if (c >= '0' && c <= '9') {
8100 width = c - '0';
8101 while (--fmtcnt >= 0) {
8102 c = *fmt++;
8103 if (c < '0' || c > '9')
8104 break;
8105 if ((width*10) / 10 != width) {
8106 PyErr_SetString(PyExc_ValueError,
8107 "width too big");
8108 goto onError;
8109 }
8110 width = width*10 + (c - '0');
8111 }
8112 }
8113 if (c == '.') {
8114 prec = 0;
8115 if (--fmtcnt >= 0)
8116 c = *fmt++;
8117 if (c == '*') {
8118 v = getnextarg(args, arglen, &argidx);
8119 if (v == NULL)
8120 goto onError;
8121 if (!PyInt_Check(v)) {
8122 PyErr_SetString(PyExc_TypeError,
8123 "* wants int");
8124 goto onError;
8125 }
8126 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008127 if (prec == -1 && PyErr_Occurred())
8128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 if (prec < 0)
8130 prec = 0;
8131 if (--fmtcnt >= 0)
8132 c = *fmt++;
8133 }
8134 else if (c >= '0' && c <= '9') {
8135 prec = c - '0';
8136 while (--fmtcnt >= 0) {
8137 c = Py_CHARMASK(*fmt++);
8138 if (c < '0' || c > '9')
8139 break;
8140 if ((prec*10) / 10 != prec) {
8141 PyErr_SetString(PyExc_ValueError,
8142 "prec too big");
8143 goto onError;
8144 }
8145 prec = prec*10 + (c - '0');
8146 }
8147 }
8148 } /* prec */
8149 if (fmtcnt >= 0) {
8150 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 if (--fmtcnt >= 0)
8152 c = *fmt++;
8153 }
8154 }
8155 if (fmtcnt < 0) {
8156 PyErr_SetString(PyExc_ValueError,
8157 "incomplete format");
8158 goto onError;
8159 }
8160 if (c != '%') {
8161 v = getnextarg(args, arglen, &argidx);
8162 if (v == NULL)
8163 goto onError;
8164 }
8165 sign = 0;
8166 fill = ' ';
8167 switch (c) {
8168
8169 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008170 pbuf = formatbuf;
8171 /* presume that buffer length is at least 1 */
8172 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 len = 1;
8174 break;
8175
8176 case 's':
8177 case 'r':
8178 if (PyUnicode_Check(v) && c == 's') {
8179 temp = v;
8180 Py_INCREF(temp);
8181 }
8182 else {
8183 PyObject *unicode;
8184 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008185 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 else
8187 temp = PyObject_Repr(v);
8188 if (temp == NULL)
8189 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008190 if (PyUnicode_Check(temp))
8191 /* nothing to do */;
8192 else if (PyString_Check(temp)) {
8193 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008194 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008196 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008198 Py_DECREF(temp);
8199 temp = unicode;
8200 if (temp == NULL)
8201 goto onError;
8202 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008203 else {
8204 Py_DECREF(temp);
8205 PyErr_SetString(PyExc_TypeError,
8206 "%s argument has non-string str()");
8207 goto onError;
8208 }
8209 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008210 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 len = PyUnicode_GET_SIZE(temp);
8212 if (prec >= 0 && len > prec)
8213 len = prec;
8214 break;
8215
8216 case 'i':
8217 case 'd':
8218 case 'u':
8219 case 'o':
8220 case 'x':
8221 case 'X':
8222 if (c == 'i')
8223 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008224 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008225 temp = formatlong(v, flags, prec, c);
8226 if (!temp)
8227 goto onError;
8228 pbuf = PyUnicode_AS_UNICODE(temp);
8229 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008230 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008232 else {
8233 pbuf = formatbuf;
8234 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8235 flags, prec, c, v);
8236 if (len < 0)
8237 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008238 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008239 }
8240 if (flags & F_ZERO)
8241 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 break;
8243
8244 case 'e':
8245 case 'E':
8246 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008247 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 case 'g':
8249 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008250 if (c == 'F')
8251 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008252 pbuf = formatbuf;
8253 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8254 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 if (len < 0)
8256 goto onError;
8257 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008258 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 fill = '0';
8260 break;
8261
8262 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008263 pbuf = formatbuf;
8264 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 if (len < 0)
8266 goto onError;
8267 break;
8268
8269 default:
8270 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008271 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008272 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008273 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008274 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008275 (Py_ssize_t)(fmt - 1 -
8276 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 goto onError;
8278 }
8279 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008280 if (*pbuf == '-' || *pbuf == '+') {
8281 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 len--;
8283 }
8284 else if (flags & F_SIGN)
8285 sign = '+';
8286 else if (flags & F_BLANK)
8287 sign = ' ';
8288 else
8289 sign = 0;
8290 }
8291 if (width < len)
8292 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008293 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 reslen -= rescnt;
8295 rescnt = width + fmtcnt + 100;
8296 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008297 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008298 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008299 PyErr_NoMemory();
8300 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008301 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008302 if (_PyUnicode_Resize(&result, reslen) < 0) {
8303 Py_XDECREF(temp);
8304 goto onError;
8305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 res = PyUnicode_AS_UNICODE(result)
8307 + reslen - rescnt;
8308 }
8309 if (sign) {
8310 if (fill != ' ')
8311 *res++ = sign;
8312 rescnt--;
8313 if (width > len)
8314 width--;
8315 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008316 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8317 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008318 assert(pbuf[1] == c);
8319 if (fill != ' ') {
8320 *res++ = *pbuf++;
8321 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008322 }
Tim Petersfff53252001-04-12 18:38:48 +00008323 rescnt -= 2;
8324 width -= 2;
8325 if (width < 0)
8326 width = 0;
8327 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 if (width > len && !(flags & F_LJUST)) {
8330 do {
8331 --rescnt;
8332 *res++ = fill;
8333 } while (--width > len);
8334 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008335 if (fill == ' ') {
8336 if (sign)
8337 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008338 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008339 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008340 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008341 *res++ = *pbuf++;
8342 *res++ = *pbuf++;
8343 }
8344 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008345 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 res += len;
8347 rescnt -= len;
8348 while (--width >= len) {
8349 --rescnt;
8350 *res++ = ' ';
8351 }
8352 if (dict && (argidx < arglen) && c != '%') {
8353 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008354 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008355 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 goto onError;
8357 }
8358 Py_XDECREF(temp);
8359 } /* '%' */
8360 } /* until end */
8361 if (argidx < arglen && !dict) {
8362 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008363 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 goto onError;
8365 }
8366
Thomas Woutersa96affe2006-03-12 00:29:36 +00008367 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8368 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 if (args_owned) {
8370 Py_DECREF(args);
8371 }
8372 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 return (PyObject *)result;
8374
8375 onError:
8376 Py_XDECREF(result);
8377 Py_DECREF(uformat);
8378 if (args_owned) {
8379 Py_DECREF(args);
8380 }
8381 return NULL;
8382}
8383
8384static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008385 (readbufferproc) unicode_buffer_getreadbuf,
8386 (writebufferproc) unicode_buffer_getwritebuf,
8387 (segcountproc) unicode_buffer_getsegcount,
8388 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389};
8390
Jeremy Hylton938ace62002-07-17 16:30:39 +00008391static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008392unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8393
Tim Peters6d6c1a32001-08-02 04:15:00 +00008394static PyObject *
8395unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8396{
8397 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008398 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008399 char *encoding = NULL;
8400 char *errors = NULL;
8401
Guido van Rossume023fe02001-08-30 03:12:59 +00008402 if (type != &PyUnicode_Type)
8403 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008404 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8405 kwlist, &x, &encoding, &errors))
8406 return NULL;
8407 if (x == NULL)
8408 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008409 if (encoding == NULL && errors == NULL)
8410 return PyObject_Unicode(x);
8411 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008412 return PyUnicode_FromEncodedObject(x, encoding, errors);
8413}
8414
Guido van Rossume023fe02001-08-30 03:12:59 +00008415static PyObject *
8416unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8417{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008418 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008420
8421 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8422 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8423 if (tmp == NULL)
8424 return NULL;
8425 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008426 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008427 if (pnew == NULL) {
8428 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008429 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008430 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008431 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8432 if (pnew->str == NULL) {
8433 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008434 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008435 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008436 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008437 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008438 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8439 pnew->length = n;
8440 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008441 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008442 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008443}
8444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008445PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008446"unicode(string [, encoding[, errors]]) -> object\n\
8447\n\
8448Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008449encoding defaults to the current default string encoding.\n\
8450errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008451
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008452static PyObject *unicode_iter(PyObject *seq);
8453
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454PyTypeObject PyUnicode_Type = {
8455 PyObject_HEAD_INIT(&PyType_Type)
8456 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008457 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 sizeof(PyUnicodeObject), /* tp_size */
8459 0, /* tp_itemsize */
8460 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008461 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008463 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008465 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008466 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008467 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008469 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 (hashfunc) unicode_hash, /* tp_hash*/
8471 0, /* tp_call*/
8472 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008473 PyObject_GenericGetAttr, /* tp_getattro */
8474 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008476 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8477 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008478 unicode_doc, /* tp_doc */
8479 0, /* tp_traverse */
8480 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008481 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008482 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008483 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008484 0, /* tp_iternext */
8485 unicode_methods, /* tp_methods */
8486 0, /* tp_members */
8487 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008488 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008489 0, /* tp_dict */
8490 0, /* tp_descr_get */
8491 0, /* tp_descr_set */
8492 0, /* tp_dictoffset */
8493 0, /* tp_init */
8494 0, /* tp_alloc */
8495 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008496 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497};
8498
8499/* Initialize the Unicode implementation */
8500
Thomas Wouters78890102000-07-22 19:25:51 +00008501void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008503 int i;
8504
Thomas Wouters477c8d52006-05-27 19:21:47 +00008505 /* XXX - move this array to unicodectype.c ? */
8506 Py_UNICODE linebreak[] = {
8507 0x000A, /* LINE FEED */
8508 0x000D, /* CARRIAGE RETURN */
8509 0x001C, /* FILE SEPARATOR */
8510 0x001D, /* GROUP SEPARATOR */
8511 0x001E, /* RECORD SEPARATOR */
8512 0x0085, /* NEXT LINE */
8513 0x2028, /* LINE SEPARATOR */
8514 0x2029, /* PARAGRAPH SEPARATOR */
8515 };
8516
Fred Drakee4315f52000-05-09 19:53:39 +00008517 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008518 unicode_freelist = NULL;
8519 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008521 if (!unicode_empty)
8522 return;
8523
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008524 for (i = 0; i < 256; i++)
8525 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008526 if (PyType_Ready(&PyUnicode_Type) < 0)
8527 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008528
8529 /* initialize the linebreak bloom filter */
8530 bloom_linebreak = make_bloom_mask(
8531 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8532 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008533
8534 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535}
8536
8537/* Finalize the Unicode implementation */
8538
8539void
Thomas Wouters78890102000-07-22 19:25:51 +00008540_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008542 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008543 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008545 Py_XDECREF(unicode_empty);
8546 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008547
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008548 for (i = 0; i < 256; i++) {
8549 if (unicode_latin1[i]) {
8550 Py_DECREF(unicode_latin1[i]);
8551 unicode_latin1[i] = NULL;
8552 }
8553 }
8554
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008555 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 PyUnicodeObject *v = u;
8557 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008558 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008559 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008560 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008561 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008563 unicode_freelist = NULL;
8564 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008566
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008567
8568
8569/********************* Unicode Iterator **************************/
8570
8571typedef struct {
8572 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008573 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008574 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8575} unicodeiterobject;
8576
8577static void
8578unicodeiter_dealloc(unicodeiterobject *it)
8579{
8580 _PyObject_GC_UNTRACK(it);
8581 Py_XDECREF(it->it_seq);
8582 PyObject_GC_Del(it);
8583}
8584
8585static int
8586unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8587{
8588 Py_VISIT(it->it_seq);
8589 return 0;
8590}
8591
8592static PyObject *
8593unicodeiter_next(unicodeiterobject *it)
8594{
8595 PyUnicodeObject *seq;
8596 PyObject *item;
8597
8598 assert(it != NULL);
8599 seq = it->it_seq;
8600 if (seq == NULL)
8601 return NULL;
8602 assert(PyUnicode_Check(seq));
8603
8604 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008605 item = PyUnicode_FromUnicode(
8606 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008607 if (item != NULL)
8608 ++it->it_index;
8609 return item;
8610 }
8611
8612 Py_DECREF(seq);
8613 it->it_seq = NULL;
8614 return NULL;
8615}
8616
8617static PyObject *
8618unicodeiter_len(unicodeiterobject *it)
8619{
8620 Py_ssize_t len = 0;
8621 if (it->it_seq)
8622 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8623 return PyInt_FromSsize_t(len);
8624}
8625
8626PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8627
8628static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008629 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8630 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008631 {NULL, NULL} /* sentinel */
8632};
8633
8634PyTypeObject PyUnicodeIter_Type = {
8635 PyObject_HEAD_INIT(&PyType_Type)
8636 0, /* ob_size */
8637 "unicodeiterator", /* tp_name */
8638 sizeof(unicodeiterobject), /* tp_basicsize */
8639 0, /* tp_itemsize */
8640 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008641 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008642 0, /* tp_print */
8643 0, /* tp_getattr */
8644 0, /* tp_setattr */
8645 0, /* tp_compare */
8646 0, /* tp_repr */
8647 0, /* tp_as_number */
8648 0, /* tp_as_sequence */
8649 0, /* tp_as_mapping */
8650 0, /* tp_hash */
8651 0, /* tp_call */
8652 0, /* tp_str */
8653 PyObject_GenericGetAttr, /* tp_getattro */
8654 0, /* tp_setattro */
8655 0, /* tp_as_buffer */
8656 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8657 0, /* tp_doc */
8658 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8659 0, /* tp_clear */
8660 0, /* tp_richcompare */
8661 0, /* tp_weaklistoffset */
8662 PyObject_SelfIter, /* tp_iter */
8663 (iternextfunc)unicodeiter_next, /* tp_iternext */
8664 unicodeiter_methods, /* tp_methods */
8665 0,
8666};
8667
8668static PyObject *
8669unicode_iter(PyObject *seq)
8670{
8671 unicodeiterobject *it;
8672
8673 if (!PyUnicode_Check(seq)) {
8674 PyErr_BadInternalCall();
8675 return NULL;
8676 }
8677 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8678 if (it == NULL)
8679 return NULL;
8680 it->it_index = 0;
8681 Py_INCREF(seq);
8682 it->it_seq = (PyUnicodeObject *)seq;
8683 _PyObject_GC_TRACK(it);
8684 return (PyObject *)it;
8685}
8686
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008687#ifdef __cplusplus
8688}
8689#endif
8690
8691
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008692/*
8693Local variables:
8694c-basic-offset: 4
8695indent-tabs-mode: nil
8696End:
8697*/