blob: f2c5afa12563bacc6fe5d67c85da56b1f0b421c3 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated character buffer intact for
   all objects on the free list whose length (in Py_UNICODE characters)
   is less than this limit.  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple, a filter is a single bitmask; the least significant
   5 bits of each Unicode character select the bit that represents it. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask (a hit may be a
   false positive, a miss is definitive).  The shifted constant is an
   unsigned long so that selecting bit 31 never shifts into the sign bit
   of an int, and mask is parenthesized so that any expression may be
   passed for it. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test for chr in set[0..setlen): the bloom mask is checked
   first so that most misses skip the linear unicode_member() scan.
   The whole expansion is parenthesized so the macro can safely be
   negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that accepts the address
   of any object-pointer variable and casts it to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldd2034312007-05-18 16:29:38 +0000396PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000397{
398 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000399 /* If the Unicode data is known at construction time, we can apply
400 some optimizations which share commonly used objects. */
401 if (u != NULL) {
402
403 /* Optimization for empty strings */
404 if (size == 0 && unicode_empty != NULL) {
405 Py_INCREF(unicode_empty);
406 return (PyObject *)unicode_empty;
407 }
408
Walter Dörwald071b9da2007-05-05 14:21:20 +0000409 /* Single characters are shared when using this constructor */
410 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000411 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000412 if (!unicode) {
413 unicode = _PyUnicode_New(1);
414 if (!unicode)
415 return NULL;
416 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 }
419 Py_INCREF(unicode);
420 return (PyObject *)unicode;
421 }
422 }
423
Walter Dörwald55507312007-05-18 13:12:10 +0000424 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000425 if (!unicode)
426 return NULL;
427
428 /* Copy the Unicode data into the new object */
429 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000430 Py_UNICODE *p = unicode->str;
431 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 ;
433 }
434
435 return (PyObject *)unicode;
436}
437
Walter Dörwaldd2034312007-05-18 16:29:38 +0000438PyObject *PyUnicode_FromString(const char *u)
439{
440 size_t size = strlen(u);
441 if (size > PY_SSIZE_T_MAX) {
442 PyErr_SetString(PyExc_OverflowError, "input too long");
443 return NULL;
444 }
445
446 return PyUnicode_FromStringAndSize(u, size);
447}
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_WCHAR_H
450
451PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
456 if (w == NULL) {
457 PyErr_BadInternalCall();
458 return NULL;
459 }
460
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the wchar_t data into the new object */
466#ifdef HAVE_USABLE_WCHAR_T
467 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000468#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 {
470 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000471 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000473 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 *u++ = *w++;
475 }
476#endif
477
478 return (PyObject *)unicode;
479}
480
Walter Dörwaldd2034312007-05-18 16:29:38 +0000481#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
482
483PyObject *
484PyUnicode_FromFormatV(const char *format, va_list vargs)
485{
486 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000487 Py_ssize_t callcount = 0;
488 PyObject **callresults = NULL;
489 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000490 Py_ssize_t n = 0;
491 const char* f;
492 Py_UNICODE *s;
493 PyObject *string;
494 /* used by sprintf */
495 char buffer[21];
496 const char *copy;
497
498#ifdef VA_LIST_IS_ARRAY
499 Py_MEMCPY(count, vargs, sizeof(va_list));
500#else
501#ifdef __va_copy
502 __va_copy(count, vargs);
503#else
504 count = vargs;
505#endif
506#endif
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000507 /* step 1: count the number of %R format specifications
508 * (we call PyObject_Repr() for these objects once during step 3
509 * and put the result in an array) */
510 for (f = format; *f; f++) {
511 if (*f == '%' && *(f+1)=='R')
512 ++callcount;
513 }
514 /* step 2: allocate memory for the results of PyObject_Repr() calls */
515 if (callcount) {
516 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
517 if (!callresults) {
518 PyErr_NoMemory();
519 return NULL;
520 }
521 callresult = callresults;
522 }
523 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000524 for (f = format; *f; f++) {
525 if (*f == '%') {
526 const char* p = f;
527 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
528 ;
529
530 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
531 * they don't affect the amount of space we reserve.
532 */
533 if ((*f == 'l' || *f == 'z') &&
534 (f[1] == 'd' || f[1] == 'u'))
535 ++f;
536
537 switch (*f) {
538 case 'c':
539 (void)va_arg(count, int);
540 /* fall through... */
541 case '%':
542 n++;
543 break;
544 case 'd': case 'u': case 'i': case 'x':
545 (void) va_arg(count, int);
546 /* 20 bytes is enough to hold a 64-bit
547 integer. Decimal takes the most space.
548 This isn't enough for octal. */
549 n += 20;
550 break;
551 case 's':
552 n += strlen(va_arg(count, char*));
553 break;
554 case 'U':
555 {
556 PyObject *obj = va_arg(count, PyObject *);
557 assert(obj && PyUnicode_Check(obj));
558 n += PyUnicode_GET_SIZE(obj);
559 break;
560 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000561 case 'R':
562 {
563 PyObject *obj = va_arg(count, PyObject *);
564 PyObject *repr;
565 assert(obj);
566 repr = PyObject_Repr(obj);
567 if (!repr)
568 goto fail;
569 n += PyUnicode_GET_SIZE(repr);
570 /* Remember the repr and switch to the next slot */
571 *callresult++ = repr;
572 break;
573 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000574 case 'p':
575 (void) va_arg(count, int);
576 /* maximum 64-bit pointer representation:
577 * 0xffffffffffffffff
578 * so 19 characters is enough.
579 * XXX I count 18 -- what's the extra for?
580 */
581 n += 19;
582 break;
583 default:
584 /* if we stumble upon an unknown
585 formatting code, copy the rest of
586 the format string to the output
587 string. (we cannot just skip the
588 code, since there's no way to know
589 what's in the argument list) */
590 n += strlen(p);
591 goto expand;
592 }
593 } else
594 n++;
595 }
596 expand:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000597 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000598 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000599 we don't have to resize the string.
600 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000601 string = PyUnicode_FromUnicode(NULL, n);
602 if (!string)
603 return NULL;
604
605 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000606 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 for (f = format; *f; f++) {
609 if (*f == '%') {
610 const char* p = f++;
611 int longflag = 0;
612 int size_tflag = 0;
613 /* parse the width.precision part (we're only
614 interested in the precision value, if any) */
615 n = 0;
616 while (isdigit(Py_CHARMASK(*f)))
617 n = (n*10) + *f++ - '0';
618 if (*f == '.') {
619 f++;
620 n = 0;
621 while (isdigit(Py_CHARMASK(*f)))
622 n = (n*10) + *f++ - '0';
623 }
624 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
625 f++;
626 /* handle the long flag, but only for %ld and %lu.
627 others can be added when necessary. */
628 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
629 longflag = 1;
630 ++f;
631 }
632 /* handle the size_t flag. */
633 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
634 size_tflag = 1;
635 ++f;
636 }
637
638 switch (*f) {
639 case 'c':
640 *s++ = va_arg(vargs, int);
641 break;
642 case 'd':
643 if (longflag)
644 sprintf(buffer, "%ld", va_arg(vargs, long));
645 else if (size_tflag)
646 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
647 va_arg(vargs, Py_ssize_t));
648 else
649 sprintf(buffer, "%d", va_arg(vargs, int));
650 appendstring(buffer);
651 break;
652 case 'u':
653 if (longflag)
654 sprintf(buffer, "%lu",
655 va_arg(vargs, unsigned long));
656 else if (size_tflag)
657 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
658 va_arg(vargs, size_t));
659 else
660 sprintf(buffer, "%u",
661 va_arg(vargs, unsigned int));
662 appendstring(buffer);
663 break;
664 case 'i':
665 sprintf(buffer, "%i", va_arg(vargs, int));
666 appendstring(buffer);
667 break;
668 case 'x':
669 sprintf(buffer, "%x", va_arg(vargs, int));
670 appendstring(buffer);
671 break;
672 case 's':
673 p = va_arg(vargs, char*);
674 appendstring(p);
675 break;
676 case 'U':
677 {
678 PyObject *obj = va_arg(vargs, PyObject *);
679 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
680 Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
681 Py_ssize_t upos;
682 for (upos = 0; upos<usize;)
683 *s++ = ucopy[upos++];
684 break;
685 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000686 case 'R':
687 {
688 /* unused, since we already have the result */
689 (void) va_arg(vargs, PyObject *);
690 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
691 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
692 Py_ssize_t upos;
693 for (upos = 0; upos<usize;)
694 *s++ = ucopy[upos++];
695 /* We're done with the repr() => forget it */
696 Py_DECREF(*callresult);
697 /* switch to next repr() result */
698 ++callresult;
699 break;
700 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000701 case 'p':
702 sprintf(buffer, "%p", va_arg(vargs, void*));
703 /* %p is ill-defined: ensure leading 0x. */
704 if (buffer[1] == 'X')
705 buffer[1] = 'x';
706 else if (buffer[1] != 'x') {
707 memmove(buffer+2, buffer, strlen(buffer)+1);
708 buffer[0] = '0';
709 buffer[1] = 'x';
710 }
711 appendstring(buffer);
712 break;
713 case '%':
714 *s++ = '%';
715 break;
716 default:
717 appendstring(p);
718 goto end;
719 }
720 } else
721 *s++ = *f;
722 }
723
724 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000725 if (callresults)
726 PyMem_Free(callresults);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
728 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000729 fail:
730 if (callresults) {
731 PyObject **callresult2 = callresults;
732 while (callresult2 <= callresult) {
733 Py_DECREF(*callresult2);
734 ++callresult2;
735 }
736 PyMem_Free(callresults);
737 }
738 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000739}
740
741#undef appendstring
742
743PyObject *
744PyUnicode_FromFormat(const char *format, ...)
745{
746 PyObject* ret;
747 va_list vargs;
748
749#ifdef HAVE_STDARG_PROTOTYPES
750 va_start(vargs, format);
751#else
752 va_start(vargs);
753#endif
754 ret = PyUnicode_FromFormatV(format, vargs);
755 va_end(vargs);
756 return ret;
757}
758
Martin v. Löwis18e16552006-02-15 17:27:45 +0000759Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
760 wchar_t *w,
761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
763 if (unicode == NULL) {
764 PyErr_BadInternalCall();
765 return -1;
766 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000767
768 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000770 size = PyUnicode_GET_SIZE(unicode) + 1;
771
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772#ifdef HAVE_USABLE_WCHAR_T
773 memcpy(w, unicode->str, size * sizeof(wchar_t));
774#else
775 {
776 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000777 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000779 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 *w++ = *u++;
781 }
782#endif
783
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000784 if (size > PyUnicode_GET_SIZE(unicode))
785 return PyUnicode_GET_SIZE(unicode);
786 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000787 return size;
788}
789
790#endif
791
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000792PyObject *PyUnicode_FromOrdinal(int ordinal)
793{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000794 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000795
796#ifdef Py_UNICODE_WIDE
797 if (ordinal < 0 || ordinal > 0x10ffff) {
798 PyErr_SetString(PyExc_ValueError,
799 "unichr() arg not in range(0x110000) "
800 "(wide Python build)");
801 return NULL;
802 }
803#else
804 if (ordinal < 0 || ordinal > 0xffff) {
805 PyErr_SetString(PyExc_ValueError,
806 "unichr() arg not in range(0x10000) "
807 "(narrow Python build)");
808 return NULL;
809 }
810#endif
811
Hye-Shik Chang40574832004-04-06 07:24:51 +0000812 s[0] = (Py_UNICODE)ordinal;
813 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000814}
815
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816PyObject *PyUnicode_FromObject(register PyObject *obj)
817{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000818 /* XXX Perhaps we should make this API an alias of
819 PyObject_Unicode() instead ?! */
820 if (PyUnicode_CheckExact(obj)) {
821 Py_INCREF(obj);
822 return obj;
823 }
824 if (PyUnicode_Check(obj)) {
825 /* For a Unicode subtype that's not a Unicode object,
826 return a true Unicode object with the same data. */
827 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
828 PyUnicode_GET_SIZE(obj));
829 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000830 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
831}
832
833PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
834 const char *encoding,
835 const char *errors)
836{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000837 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000838 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000839 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000840
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841 if (obj == NULL) {
842 PyErr_BadInternalCall();
843 return NULL;
844 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000845
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000846#if 0
847 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000848 that no encodings is given and then redirect to
849 PyObject_Unicode() which then applies the additional logic for
850 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000851
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000852 NOTE: This API should really only be used for object which
853 represent *encoded* Unicode !
854
855 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000856 if (PyUnicode_Check(obj)) {
857 if (encoding) {
858 PyErr_SetString(PyExc_TypeError,
859 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000860 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000861 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000862 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000864#else
865 if (PyUnicode_Check(obj)) {
866 PyErr_SetString(PyExc_TypeError,
867 "decoding Unicode is not supported");
868 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000869 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000870#endif
871
872 /* Coerce object */
873 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000874 s = PyString_AS_STRING(obj);
875 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000876 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000877 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
878 /* Overwrite the error message with something more useful in
879 case of a TypeError. */
880 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000881 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000882 "coercing to Unicode: need string or buffer, "
883 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000884 obj->ob_type->tp_name);
885 goto onError;
886 }
Tim Petersced69f82003-09-16 20:30:58 +0000887
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000888 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 if (len == 0) {
890 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000891 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 }
Tim Petersced69f82003-09-16 20:30:58 +0000893 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000894 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000895
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000896 return v;
897
898 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900}
901
902PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000903 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 const char *encoding,
905 const char *errors)
906{
907 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000908
909 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000910 encoding = PyUnicode_GetDefaultEncoding();
911
912 /* Shortcuts for common default encodings */
913 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000914 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000915 else if (strcmp(encoding, "latin-1") == 0)
916 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000917#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
918 else if (strcmp(encoding, "mbcs") == 0)
919 return PyUnicode_DecodeMBCS(s, size, errors);
920#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000921 else if (strcmp(encoding, "ascii") == 0)
922 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000923
924 /* Decode via the codec registry */
925 buffer = PyBuffer_FromMemory((void *)s, size);
926 if (buffer == NULL)
927 goto onError;
928 unicode = PyCodec_Decode(buffer, encoding, errors);
929 if (unicode == NULL)
930 goto onError;
931 if (!PyUnicode_Check(unicode)) {
932 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000933 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 unicode->ob_type->tp_name);
935 Py_DECREF(unicode);
936 goto onError;
937 }
938 Py_DECREF(buffer);
939 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 onError:
942 Py_XDECREF(buffer);
943 return NULL;
944}
945
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000946PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
947 const char *encoding,
948 const char *errors)
949{
950 PyObject *v;
951
952 if (!PyUnicode_Check(unicode)) {
953 PyErr_BadArgument();
954 goto onError;
955 }
956
957 if (encoding == NULL)
958 encoding = PyUnicode_GetDefaultEncoding();
959
960 /* Decode via the codec registry */
961 v = PyCodec_Decode(unicode, encoding, errors);
962 if (v == NULL)
963 goto onError;
964 return v;
965
966 onError:
967 return NULL;
968}
969
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000971 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 const char *encoding,
973 const char *errors)
974{
975 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000976
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 unicode = PyUnicode_FromUnicode(s, size);
978 if (unicode == NULL)
979 return NULL;
980 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
981 Py_DECREF(unicode);
982 return v;
983}
984
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000985PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
986 const char *encoding,
987 const char *errors)
988{
989 PyObject *v;
990
991 if (!PyUnicode_Check(unicode)) {
992 PyErr_BadArgument();
993 goto onError;
994 }
995
996 if (encoding == NULL)
997 encoding = PyUnicode_GetDefaultEncoding();
998
999 /* Encode via the codec registry */
1000 v = PyCodec_Encode(unicode, encoding, errors);
1001 if (v == NULL)
1002 goto onError;
1003 return v;
1004
1005 onError:
1006 return NULL;
1007}
1008
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1010 const char *encoding,
1011 const char *errors)
1012{
1013 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (!PyUnicode_Check(unicode)) {
1016 PyErr_BadArgument();
1017 goto onError;
1018 }
Fred Drakee4315f52000-05-09 19:53:39 +00001019
Tim Petersced69f82003-09-16 20:30:58 +00001020 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001021 encoding = PyUnicode_GetDefaultEncoding();
1022
1023 /* Shortcuts for common default encodings */
1024 if (errors == NULL) {
1025 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001026 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001027 else if (strcmp(encoding, "latin-1") == 0)
1028 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001029#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1030 else if (strcmp(encoding, "mbcs") == 0)
1031 return PyUnicode_AsMBCSString(unicode);
1032#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001033 else if (strcmp(encoding, "ascii") == 0)
1034 return PyUnicode_AsASCIIString(unicode);
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 /* Encode via the codec registry */
1038 v = PyCodec_Encode(unicode, encoding, errors);
1039 if (v == NULL)
1040 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001041 if (!PyBytes_Check(v)) {
1042 if (PyString_Check(v)) {
1043 /* Old codec, turn it into bytes */
1044 PyObject *b = PyBytes_FromObject(v);
1045 Py_DECREF(v);
1046 return b;
1047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001049 "encoder did not return a bytes object "
1050 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1051 v->ob_type->tp_name,
1052 encoding ? encoding : "NULL",
1053 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 Py_DECREF(v);
1055 goto onError;
1056 }
1057 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001058
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 onError:
1060 return NULL;
1061}
1062
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001063PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1064 const char *errors)
1065{
1066 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001067 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001068 if (v)
1069 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001070 if (errors != NULL)
1071 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1072 if (errors == NULL) {
1073 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1074 PyUnicode_GET_SIZE(unicode),
1075 NULL);
1076 }
1077 else {
1078 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1079 }
1080 if (!b)
1081 return NULL;
1082 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1083 PyBytes_Size(b));
1084 Py_DECREF(b);
1085 if (!errors) {
1086 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001087 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001088 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001089 return v;
1090}
1091
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1093{
1094 if (!PyUnicode_Check(unicode)) {
1095 PyErr_BadArgument();
1096 goto onError;
1097 }
1098 return PyUnicode_AS_UNICODE(unicode);
1099
1100 onError:
1101 return NULL;
1102}
1103
Martin v. Löwis18e16552006-02-15 17:27:45 +00001104Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
1106 if (!PyUnicode_Check(unicode)) {
1107 PyErr_BadArgument();
1108 goto onError;
1109 }
1110 return PyUnicode_GET_SIZE(unicode);
1111
1112 onError:
1113 return -1;
1114}
1115
Thomas Wouters78890102000-07-22 19:25:51 +00001116const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001117{
1118 return unicode_default_encoding;
1119}
1120
1121int PyUnicode_SetDefaultEncoding(const char *encoding)
1122{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001123 if (strcmp(encoding, unicode_default_encoding) != 0) {
1124 PyErr_Format(PyExc_ValueError,
1125 "Can only set default encoding to %s",
1126 unicode_default_encoding);
1127 return -1;
1128 }
Fred Drakee4315f52000-05-09 19:53:39 +00001129 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001130}
1131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132/* error handling callback helper:
1133 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001134 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001135 and adjust various state variables.
1136 return 0 on success, -1 on error
1137*/
1138
1139static
1140int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1141 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001142 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1143 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001145 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001146
1147 PyObject *restuple = NULL;
1148 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1150 Py_ssize_t requiredsize;
1151 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001152 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001153 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001154 int res = -1;
1155
1156 if (*errorHandler == NULL) {
1157 *errorHandler = PyCodec_LookupError(errors);
1158 if (*errorHandler == NULL)
1159 goto onError;
1160 }
1161
1162 if (*exceptionObject == NULL) {
1163 *exceptionObject = PyUnicodeDecodeError_Create(
1164 encoding, input, insize, *startinpos, *endinpos, reason);
1165 if (*exceptionObject == NULL)
1166 goto onError;
1167 }
1168 else {
1169 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1170 goto onError;
1171 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1172 goto onError;
1173 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1174 goto onError;
1175 }
1176
1177 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1178 if (restuple == NULL)
1179 goto onError;
1180 if (!PyTuple_Check(restuple)) {
1181 PyErr_Format(PyExc_TypeError, &argparse[4]);
1182 goto onError;
1183 }
1184 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1185 goto onError;
1186 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001187 newpos = insize+newpos;
1188 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001189 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001190 goto onError;
1191 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001192
1193 /* need more space? (at least enough for what we
1194 have+the replacement+the rest of the string (starting
1195 at the new input position), so we won't have to check space
1196 when there are no errors in the rest of the string) */
1197 repptr = PyUnicode_AS_UNICODE(repunicode);
1198 repsize = PyUnicode_GET_SIZE(repunicode);
1199 requiredsize = *outpos + repsize + insize-newpos;
1200 if (requiredsize > outsize) {
1201 if (requiredsize<2*outsize)
1202 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001203 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 goto onError;
1205 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1206 }
1207 *endinpos = newpos;
1208 *inptr = input + newpos;
1209 Py_UNICODE_COPY(*outptr, repptr, repsize);
1210 *outptr += repsize;
1211 *outpos += repsize;
1212 /* we made it! */
1213 res = 0;
1214
1215 onError:
1216 Py_XDECREF(restuple);
1217 return res;
1218}
1219
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001220/* --- UTF-7 Codec -------------------------------------------------------- */
1221
1222/* see RFC2152 for details */
1223
/* Per-character classification of the 128 ASCII code points for UTF-7:
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS) is true when character `c` has to be
   base64-encoded: always for non-ASCII characters and class 1, for
   class 2 when `encodeWS` is set, and for class 3 when `encodeO` is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && utf7_special[(c)] == 2) || \
     ((encodeO) && utf7_special[(c)] == 3))
1252
/* Base64 helpers for the UTF-7 codec.

   B64(n)      base64 digit for the low 6 bits of n
   B64CHAR(c)  true when c is a base64 digit.  Written with explicit
               ASCII ranges instead of isalnum(): isalnum() is undefined
               for values that do not fit in an unsigned char and its
               result depends on the current locale, while UTF-7 is
               defined purely in terms of ASCII.
   UB64(c)     value (0..63) of the base64 digit c; c must satisfy
               B64CHAR(c)

   All of these evaluate their argument more than once, so the argument
   must not have side effects. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while the accumulator `ch` holds at least six
   pending bits (`bits` counts them), emit the top six as one base64
   digit through `out` and drop them from the count.  `out` and `bits`
   must be lvalues; they are updated in place.  Wrapped in
   do { } while (0) so it behaves as a single statement. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while the accumulator `ch` holds at
   least sixteen pending bits, take the top sixteen as one Py_UNICODE and
   either store it through `out` or report an error.

   The expansion relies on the enclosing function providing a variable
   named `errmsg` and a label named `utf7Error`. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001285
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001286PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001288 const char *errors)
1289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001290 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001291 Py_ssize_t startinpos;
1292 Py_ssize_t endinpos;
1293 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001294 const char *e;
1295 PyUnicodeObject *unicode;
1296 Py_UNICODE *p;
1297 const char *errmsg = "";
1298 int inShift = 0;
1299 unsigned int bitsleft = 0;
1300 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 int surrogate = 0;
1302 PyObject *errorHandler = NULL;
1303 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001304
1305 unicode = _PyUnicode_New(size);
1306 if (!unicode)
1307 return NULL;
1308 if (size == 0)
1309 return (PyObject *)unicode;
1310
1311 p = unicode->str;
1312 e = s + size;
1313
1314 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001315 Py_UNICODE ch;
1316 restart:
1317 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001318
1319 if (inShift) {
1320 if ((ch == '-') || !B64CHAR(ch)) {
1321 inShift = 0;
1322 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001323
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001324 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1325 if (bitsleft >= 6) {
1326 /* The shift sequence has a partial character in it. If
1327 bitsleft < 6 then we could just classify it as padding
1328 but that is not the case here */
1329
1330 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001331 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001332 }
1333 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001334 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001335 here so indicate the potential of a misencoded character. */
1336
1337 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1338 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1339 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001340 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001341 }
1342
1343 if (ch == '-') {
1344 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001345 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001346 inShift = 1;
1347 }
1348 } else if (SPECIAL(ch,0,0)) {
1349 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001350 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001351 } else {
1352 *p++ = ch;
1353 }
1354 } else {
1355 charsleft = (charsleft << 6) | UB64(ch);
1356 bitsleft += 6;
1357 s++;
1358 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1359 }
1360 }
1361 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001363 s++;
1364 if (s < e && *s == '-') {
1365 s++;
1366 *p++ = '+';
1367 } else
1368 {
1369 inShift = 1;
1370 bitsleft = 0;
1371 }
1372 }
1373 else if (SPECIAL(ch,0,0)) {
1374 errmsg = "unexpected special character";
1375 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001376 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001377 }
1378 else {
1379 *p++ = ch;
1380 s++;
1381 }
1382 continue;
1383 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001384 outpos = p-PyUnicode_AS_UNICODE(unicode);
1385 endinpos = s-starts;
1386 if (unicode_decode_call_errorhandler(
1387 errors, &errorHandler,
1388 "utf7", errmsg,
1389 starts, size, &startinpos, &endinpos, &exc, &s,
1390 (PyObject **)&unicode, &outpos, &p))
1391 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001392 }
1393
1394 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 outpos = p-PyUnicode_AS_UNICODE(unicode);
1396 endinpos = size;
1397 if (unicode_decode_call_errorhandler(
1398 errors, &errorHandler,
1399 "utf7", "unterminated shift sequence",
1400 starts, size, &startinpos, &endinpos, &exc, &s,
1401 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001402 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 if (s < e)
1404 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001405 }
1406
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001407 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001408 goto onError;
1409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 Py_XDECREF(errorHandler);
1411 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001412 return (PyObject *)unicode;
1413
1414onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001415 Py_XDECREF(errorHandler);
1416 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417 Py_DECREF(unicode);
1418 return NULL;
1419}
1420
1421
1422PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 int encodeSetO,
1425 int encodeWhiteSpace,
1426 const char *errors)
1427{
1428 PyObject *v;
1429 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001430 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001433 unsigned int bitsleft = 0;
1434 unsigned long charsleft = 0;
1435 char * out;
1436 char * start;
1437
1438 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001439 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440
Walter Dörwald51ab4142007-05-05 14:43:36 +00001441 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001442 if (v == NULL)
1443 return NULL;
1444
Walter Dörwald51ab4142007-05-05 14:43:36 +00001445 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001446 for (;i < size; ++i) {
1447 Py_UNICODE ch = s[i];
1448
1449 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001450 if (ch == '+') {
1451 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452 *out++ = '-';
1453 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1454 charsleft = ch;
1455 bitsleft = 16;
1456 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001457 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001458 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001459 } else {
1460 *out++ = (char) ch;
1461 }
1462 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001463 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1464 *out++ = B64(charsleft << (6-bitsleft));
1465 charsleft = 0;
1466 bitsleft = 0;
1467 /* Characters not in the BASE64 set implicitly unshift the sequence
1468 so no '-' is required, except if the character is itself a '-' */
1469 if (B64CHAR(ch) || ch == '-') {
1470 *out++ = '-';
1471 }
1472 inShift = 0;
1473 *out++ = (char) ch;
1474 } else {
1475 bitsleft += 16;
1476 charsleft = (charsleft << 16) | ch;
1477 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1478
1479 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001480 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 or '-' then the shift sequence will be terminated implicitly and we
1482 don't have to insert a '-'. */
1483
1484 if (bitsleft == 0) {
1485 if (i + 1 < size) {
1486 Py_UNICODE ch2 = s[i+1];
1487
1488 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001489
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001490 } else if (B64CHAR(ch2) || ch2 == '-') {
1491 *out++ = '-';
1492 inShift = 0;
1493 } else {
1494 inShift = 0;
1495 }
1496
1497 }
1498 else {
1499 *out++ = '-';
1500 inShift = 0;
1501 }
1502 }
Tim Petersced69f82003-09-16 20:30:58 +00001503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001506 if (bitsleft) {
1507 *out++= B64(charsleft << (6-bitsleft) );
1508 *out++ = '-';
1509 }
1510
Walter Dörwald51ab4142007-05-05 14:43:36 +00001511 if (PyBytes_Resize(v, out - start)) {
1512 Py_DECREF(v);
1513 return NULL;
1514 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001515 return v;
1516}
1517
1518#undef SPECIAL
1519#undef B64
1520#undef B64CHAR
1521#undef UB64
1522#undef ENCODE
1523#undef DECODE
1524
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525/* --- UTF-8 Codec -------------------------------------------------------- */
1526
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001550 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551 const char *errors)
1552{
Walter Dörwald69652032004-09-07 20:24:22 +00001553 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1554}
1555
1556PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001557 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001558 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001559 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001561 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001563 Py_ssize_t startinpos;
1564 Py_ssize_t endinpos;
1565 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 const char *e;
1567 PyUnicodeObject *unicode;
1568 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001570 PyObject *errorHandler = NULL;
1571 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572
1573 /* Note: size will always be longer than the resulting Unicode
1574 character count */
1575 unicode = _PyUnicode_New(size);
1576 if (!unicode)
1577 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001578 if (size == 0) {
1579 if (consumed)
1580 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583
1584 /* Unpack UTF-8 encoded data */
1585 p = unicode->str;
1586 e = s + size;
1587
1588 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001589 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
1591 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001592 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 s++;
1594 continue;
1595 }
1596
1597 n = utf8_code_length[ch];
1598
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001600 if (consumed)
1601 break;
1602 else {
1603 errmsg = "unexpected end of data";
1604 startinpos = s-starts;
1605 endinpos = size;
1606 goto utf8Error;
1607 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
1610 switch (n) {
1611
1612 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001613 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 startinpos = s-starts;
1615 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001616 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617
1618 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001619 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 startinpos = s-starts;
1621 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623
1624 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001625 if ((s[1] & 0xc0) != 0x80) {
1626 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 startinpos = s-starts;
1628 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001629 goto utf8Error;
1630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001632 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 startinpos = s-starts;
1634 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 errmsg = "illegal encoding";
1636 goto utf8Error;
1637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001639 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 break;
1641
1642 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001643 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001644 (s[2] & 0xc0) != 0x80) {
1645 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 startinpos = s-starts;
1647 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 goto utf8Error;
1649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001651 if (ch < 0x0800) {
1652 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001653 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001654
1655 XXX For wide builds (UCS-4) we should probably try
1656 to recombine the surrogates into a single code
1657 unit.
1658 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001659 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 startinpos = s-starts;
1661 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001662 goto utf8Error;
1663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001665 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001666 break;
1667
1668 case 4:
1669 if ((s[1] & 0xc0) != 0x80 ||
1670 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001671 (s[3] & 0xc0) != 0x80) {
1672 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 startinpos = s-starts;
1674 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001675 goto utf8Error;
1676 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001677 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1678 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1679 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001680 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001681 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001682 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001683 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001684 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001685 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 startinpos = s-starts;
1687 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001688 goto utf8Error;
1689 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001690#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001691 *p++ = (Py_UNICODE)ch;
1692#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001693 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001694
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001695 /* translate from 10000..10FFFF to 0..FFFF */
1696 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001697
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001698 /* high surrogate = top 10 bits added to D800 */
1699 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001700
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001701 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001702 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 break;
1705
1706 default:
1707 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001708 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 startinpos = s-starts;
1710 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001711 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 }
1713 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001714 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001715
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001716 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001717 outpos = p-PyUnicode_AS_UNICODE(unicode);
1718 if (unicode_decode_call_errorhandler(
1719 errors, &errorHandler,
1720 "utf8", errmsg,
1721 starts, size, &startinpos, &endinpos, &exc, &s,
1722 (PyObject **)&unicode, &outpos, &p))
1723 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724 }
Walter Dörwald69652032004-09-07 20:24:22 +00001725 if (consumed)
1726 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727
1728 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001729 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 goto onError;
1731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 Py_XDECREF(errorHandler);
1733 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 return (PyObject *)unicode;
1735
1736onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 Py_DECREF(unicode);
1740 return NULL;
1741}
1742
Tim Peters602f7402002-04-27 18:03:26 +00001743/* Allocation strategy: if the string is short, convert into a stack buffer
1744 and allocate exactly as much space needed at the end. Else allocate the
1745 maximum possible needed (4 result bytes per Unicode character), and return
1746 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001747*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001748PyObject *
1749PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752{
Tim Peters602f7402002-04-27 18:03:26 +00001753#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001754
Martin v. Löwis18e16552006-02-15 17:27:45 +00001755 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001756 PyObject *v; /* result string object */
1757 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001759 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001760 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001761
Tim Peters602f7402002-04-27 18:03:26 +00001762 assert(s != NULL);
1763 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
Tim Peters602f7402002-04-27 18:03:26 +00001765 if (size <= MAX_SHORT_UNICHARS) {
1766 /* Write into the stack buffer; nallocated can't overflow.
1767 * At the end, we'll allocate exactly as much heap space as it
1768 * turns out we need.
1769 */
1770 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1771 v = NULL; /* will allocate after we're done */
1772 p = stackbuf;
1773 }
1774 else {
1775 /* Overallocate on the heap, and give the excess back at the end. */
1776 nallocated = size * 4;
1777 if (nallocated / 4 != size) /* overflow! */
1778 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001779 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001780 if (v == NULL)
1781 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001782 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001783 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001784
Tim Peters602f7402002-04-27 18:03:26 +00001785 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001786 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001787
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001788 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001789 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001793 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001794 *p++ = (char)(0xc0 | (ch >> 6));
1795 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001796 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001797 else {
Tim Peters602f7402002-04-27 18:03:26 +00001798 /* Encode UCS2 Unicode ordinals */
1799 if (ch < 0x10000) {
1800 /* Special case: check for high surrogate */
1801 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1802 Py_UCS4 ch2 = s[i];
1803 /* Check for low surrogate and combine the two to
1804 form a UCS4 value */
1805 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001806 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001807 i++;
1808 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001809 }
Tim Peters602f7402002-04-27 18:03:26 +00001810 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001811 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001813 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1814 *p++ = (char)(0x80 | (ch & 0x3f));
1815 continue;
1816 }
1817encodeUCS4:
1818 /* Encode UCS4 Unicode ordinals */
1819 *p++ = (char)(0xf0 | (ch >> 18));
1820 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1821 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1822 *p++ = (char)(0x80 | (ch & 0x3f));
1823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001825
Tim Peters602f7402002-04-27 18:03:26 +00001826 if (v == NULL) {
1827 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001828 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001829 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001830 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001831 }
1832 else {
1833 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001834 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001835 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001836 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001839
Tim Peters602f7402002-04-27 18:03:26 +00001840#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841}
1842
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1844{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 if (!PyUnicode_Check(unicode)) {
1846 PyErr_BadArgument();
1847 return NULL;
1848 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001849 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1850 PyUnicode_GET_SIZE(unicode),
1851 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852}
1853
1854/* --- UTF-16 Codec ------------------------------------------------------- */
1855
Tim Peters772747b2001-08-09 22:21:55 +00001856PyObject *
1857PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001858 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001859 const char *errors,
1860 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861{
Walter Dörwald69652032004-09-07 20:24:22 +00001862 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1863}
1864
1865PyObject *
1866PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001867 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001868 const char *errors,
1869 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001870 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001873 Py_ssize_t startinpos;
1874 Py_ssize_t endinpos;
1875 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 PyUnicodeObject *unicode;
1877 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001878 const unsigned char *q, *e;
1879 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001881 /* Offsets from q for retrieving byte pairs in the right order. */
1882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1883 int ihi = 1, ilo = 0;
1884#else
1885 int ihi = 0, ilo = 1;
1886#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 PyObject *errorHandler = NULL;
1888 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 /* Note: size will always be longer than the resulting Unicode
1891 character count */
1892 unicode = _PyUnicode_New(size);
1893 if (!unicode)
1894 return NULL;
1895 if (size == 0)
1896 return (PyObject *)unicode;
1897
1898 /* Unpack UTF-16 encoded data */
1899 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001900 q = (unsigned char *)s;
1901 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902
1903 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001904 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001906 /* Check for BOM marks (U+FEFF) in the input and adjust current
1907 byte order setting accordingly. In native mode, the leading BOM
1908 mark is skipped, in all other modes, it is copied to the output
1909 stream as-is (giving a ZWNBSP character). */
1910 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001911 if (size >= 2) {
1912 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001913#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001914 if (bom == 0xFEFF) {
1915 q += 2;
1916 bo = -1;
1917 }
1918 else if (bom == 0xFFFE) {
1919 q += 2;
1920 bo = 1;
1921 }
Tim Petersced69f82003-09-16 20:30:58 +00001922#else
Walter Dörwald69652032004-09-07 20:24:22 +00001923 if (bom == 0xFEFF) {
1924 q += 2;
1925 bo = 1;
1926 }
1927 else if (bom == 0xFFFE) {
1928 q += 2;
1929 bo = -1;
1930 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001931#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001932 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934
Tim Peters772747b2001-08-09 22:21:55 +00001935 if (bo == -1) {
1936 /* force LE */
1937 ihi = 1;
1938 ilo = 0;
1939 }
1940 else if (bo == 1) {
1941 /* force BE */
1942 ihi = 0;
1943 ilo = 1;
1944 }
1945
1946 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001948 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001949 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001950 if (consumed)
1951 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 errmsg = "truncated data";
1953 startinpos = ((const char *)q)-starts;
1954 endinpos = ((const char *)e)-starts;
1955 goto utf16Error;
1956 /* The remaining input chars are ignored if the callback
1957 chooses to skip the input */
1958 }
1959 ch = (q[ihi] << 8) | q[ilo];
1960
Tim Peters772747b2001-08-09 22:21:55 +00001961 q += 2;
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 if (ch < 0xD800 || ch > 0xDFFF) {
1964 *p++ = ch;
1965 continue;
1966 }
1967
1968 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 if (q >= e) {
1970 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 startinpos = (((const char *)q)-2)-starts;
1972 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001973 goto utf16Error;
1974 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001975 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001976 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1977 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001978 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001979#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001980 *p++ = ch;
1981 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001982#else
1983 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001984#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001985 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001986 }
1987 else {
1988 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 startinpos = (((const char *)q)-4)-starts;
1990 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001991 goto utf16Error;
1992 }
1993
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001995 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001996 startinpos = (((const char *)q)-2)-starts;
1997 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001998 /* Fall through to report the error */
1999
2000 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001 outpos = p-PyUnicode_AS_UNICODE(unicode);
2002 if (unicode_decode_call_errorhandler(
2003 errors, &errorHandler,
2004 "utf16", errmsg,
2005 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2006 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 }
2009
2010 if (byteorder)
2011 *byteorder = bo;
2012
Walter Dörwald69652032004-09-07 20:24:22 +00002013 if (consumed)
2014 *consumed = (const char *)q-starts;
2015
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002017 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 goto onError;
2019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 Py_XDECREF(errorHandler);
2021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 return (PyObject *)unicode;
2023
2024onError:
2025 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 Py_XDECREF(errorHandler);
2027 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 return NULL;
2029}
2030
Tim Peters772747b2001-08-09 22:21:55 +00002031PyObject *
2032PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002033 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002034 const char *errors,
2035 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036{
2037 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002038 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002039#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002040 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002041#else
2042 const int pairs = 0;
2043#endif
Tim Peters772747b2001-08-09 22:21:55 +00002044 /* Offsets from p for storing byte pairs in the right order. */
2045#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2046 int ihi = 1, ilo = 0;
2047#else
2048 int ihi = 0, ilo = 1;
2049#endif
2050
2051#define STORECHAR(CH) \
2052 do { \
2053 p[ihi] = ((CH) >> 8) & 0xff; \
2054 p[ilo] = (CH) & 0xff; \
2055 p += 2; \
2056 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002058#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002059 for (i = pairs = 0; i < size; i++)
2060 if (s[i] >= 0x10000)
2061 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002062#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002063 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002064 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (v == NULL)
2066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067
Walter Dörwald3cc34522007-05-04 10:48:27 +00002068 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002070 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002071 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002072 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002073
2074 if (byteorder == -1) {
2075 /* force LE */
2076 ihi = 1;
2077 ilo = 0;
2078 }
2079 else if (byteorder == 1) {
2080 /* force BE */
2081 ihi = 0;
2082 ilo = 1;
2083 }
2084
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002085 while (size-- > 0) {
2086 Py_UNICODE ch = *s++;
2087 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002088#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002089 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002090 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2091 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002093#endif
Tim Peters772747b2001-08-09 22:21:55 +00002094 STORECHAR(ch);
2095 if (ch2)
2096 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002099#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100}
2101
2102PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2103{
2104 if (!PyUnicode_Check(unicode)) {
2105 PyErr_BadArgument();
2106 return NULL;
2107 }
2108 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2109 PyUnicode_GET_SIZE(unicode),
2110 NULL,
2111 0);
2112}
2113
2114/* --- Unicode Escape Codec ----------------------------------------------- */
2115
/* Name-lookup C API obtained from the unicodedata module's "ucnhash_CAPI"
   attribute.  Stays NULL until PyUnicode_DecodeUnicodeEscape loads it the
   first time it meets a \N{...} escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002116static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002117
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002119 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 const char *errors)
2121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002123 Py_ssize_t startinpos;
2124 Py_ssize_t endinpos;
2125 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002130 char* message;
2131 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 PyObject *errorHandler = NULL;
2133 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 /* Escaped strings will always be longer than the resulting
2136 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 length after conversion to the true value.
2138 (but if the error callback returns a long replacement string
2139 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 v = _PyUnicode_New(size);
2141 if (v == NULL)
2142 goto onError;
2143 if (size == 0)
2144 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002146 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002148
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 while (s < end) {
2150 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002151 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
2154 /* Non-escape characters are interpreted as Unicode ordinals */
2155 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002156 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 continue;
2158 }
2159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002160 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* \ - Escapes */
2162 s++;
2163 switch (*s++) {
2164
2165 /* \x escapes */
2166 case '\n': break;
2167 case '\\': *p++ = '\\'; break;
2168 case '\'': *p++ = '\''; break;
2169 case '\"': *p++ = '\"'; break;
2170 case 'b': *p++ = '\b'; break;
2171 case 'f': *p++ = '\014'; break; /* FF */
2172 case 't': *p++ = '\t'; break;
2173 case 'n': *p++ = '\n'; break;
2174 case 'r': *p++ = '\r'; break;
2175 case 'v': *p++ = '\013'; break; /* VT */
2176 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2177
2178 /* \OOO (octal) escapes */
2179 case '0': case '1': case '2': case '3':
2180 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002181 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002183 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002185 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 break;
2189
Fredrik Lundhccc74732001-02-18 22:13:49 +00002190 /* hex escapes */
2191 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002193 digits = 2;
2194 message = "truncated \\xXX escape";
2195 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
Fredrik Lundhccc74732001-02-18 22:13:49 +00002197 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002199 digits = 4;
2200 message = "truncated \\uXXXX escape";
2201 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202
Fredrik Lundhccc74732001-02-18 22:13:49 +00002203 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002204 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002205 digits = 8;
2206 message = "truncated \\UXXXXXXXX escape";
2207 hexescape:
2208 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 outpos = p-PyUnicode_AS_UNICODE(v);
2210 if (s+digits>end) {
2211 endinpos = size;
2212 if (unicode_decode_call_errorhandler(
2213 errors, &errorHandler,
2214 "unicodeescape", "end of string in escape sequence",
2215 starts, size, &startinpos, &endinpos, &exc, &s,
2216 (PyObject **)&v, &outpos, &p))
2217 goto onError;
2218 goto nextByte;
2219 }
2220 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002221 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002222 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002223 endinpos = (s+i+1)-starts;
2224 if (unicode_decode_call_errorhandler(
2225 errors, &errorHandler,
2226 "unicodeescape", message,
2227 starts, size, &startinpos, &endinpos, &exc, &s,
2228 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002229 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002230 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002231 }
2232 chr = (chr<<4) & ~0xF;
2233 if (c >= '0' && c <= '9')
2234 chr += c - '0';
2235 else if (c >= 'a' && c <= 'f')
2236 chr += 10 + c - 'a';
2237 else
2238 chr += 10 + c - 'A';
2239 }
2240 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002241 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 /* _decoding_error will have already written into the
2243 target buffer. */
2244 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002245 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002246 /* when we get here, chr is a 32-bit unicode character */
2247 if (chr <= 0xffff)
2248 /* UCS-2 character */
2249 *p++ = (Py_UNICODE) chr;
2250 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002251 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002252 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002253#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002254 *p++ = chr;
2255#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002256 chr -= 0x10000L;
2257 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002258 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002259#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002260 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002261 endinpos = s-starts;
2262 outpos = p-PyUnicode_AS_UNICODE(v);
2263 if (unicode_decode_call_errorhandler(
2264 errors, &errorHandler,
2265 "unicodeescape", "illegal Unicode character",
2266 starts, size, &startinpos, &endinpos, &exc, &s,
2267 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002268 goto onError;
2269 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002270 break;
2271
2272 /* \N{name} */
2273 case 'N':
2274 message = "malformed \\N character escape";
2275 if (ucnhash_CAPI == NULL) {
2276 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002277 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002278 m = PyImport_ImportModule("unicodedata");
2279 if (m == NULL)
2280 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002281 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002282 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002283 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002284 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002285 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002286 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002287 if (ucnhash_CAPI == NULL)
2288 goto ucnhashError;
2289 }
2290 if (*s == '{') {
2291 const char *start = s+1;
2292 /* look for the closing brace */
2293 while (*s != '}' && s < end)
2294 s++;
2295 if (s > start && s < end && *s == '}') {
2296 /* found a name. look it up in the unicode database */
2297 message = "unknown Unicode character name";
2298 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002299 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002300 goto store;
2301 }
2302 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002303 endinpos = s-starts;
2304 outpos = p-PyUnicode_AS_UNICODE(v);
2305 if (unicode_decode_call_errorhandler(
2306 errors, &errorHandler,
2307 "unicodeescape", message,
2308 starts, size, &startinpos, &endinpos, &exc, &s,
2309 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002310 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002311 break;
2312
2313 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002314 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002315 message = "\\ at end of string";
2316 s--;
2317 endinpos = s-starts;
2318 outpos = p-PyUnicode_AS_UNICODE(v);
2319 if (unicode_decode_call_errorhandler(
2320 errors, &errorHandler,
2321 "unicodeescape", message,
2322 starts, size, &startinpos, &endinpos, &exc, &s,
2323 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002324 goto onError;
2325 }
2326 else {
2327 *p++ = '\\';
2328 *p++ = (unsigned char)s[-1];
2329 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002330 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002332 nextByte:
2333 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002335 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002336 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002337 Py_XDECREF(errorHandler);
2338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002340
Fredrik Lundhccc74732001-02-18 22:13:49 +00002341ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002342 PyErr_SetString(
2343 PyExc_UnicodeError,
2344 "\\N escapes not supported (can't load unicodedata module)"
2345 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002346 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002349 return NULL;
2350
Fredrik Lundhccc74732001-02-18 22:13:49 +00002351onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 return NULL;
2356}
2357
2358/* Return a Unicode-Escape string version of the Unicode object.
2359
2360 If quotes is true, the string is enclosed in u"" or u'' quotes as
2361 appropriate.
2362
2363*/
2364
Thomas Wouters477c8d52006-05-27 19:21:47 +00002365Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2366 Py_ssize_t size,
2367 Py_UNICODE ch)
2368{
2369 /* like wcschr, but doesn't stop at NULL characters */
2370
2371 while (size-- > 0) {
2372 if (*s == ch)
2373 return s;
2374 s++;
2375 }
2376
2377 return NULL;
2378}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002379
/* Lower-case hexadecimal digits, indexed by nibble value (0..15);
   used by the escape encoders to format '\xhh', '\uxxxx' and
   '\Uxxxxxxxx' sequences. */
static const char *hexdigits =
    "0123456789abcdef";
2381
2382PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2383 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384{
2385 PyObject *repr;
2386 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387
Thomas Wouters89f507f2006-12-13 04:49:30 +00002388 /* XXX(nnorwitz): rather than over-allocating, it would be
2389 better to choose a different scheme. Perhaps scan the
2390 first N-chars of the string and allocate based on that size.
2391 */
2392 /* Initial allocation is based on the longest-possible unichr
2393 escape.
2394
2395 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2396 unichr, so in this case it's the longest unichr escape. In
2397 narrow (UTF-16) builds this is five chars per source unichr
2398 since there are two unichrs in the surrogate pair, so in narrow
2399 (UTF-16) builds it's not the longest unichr escape.
2400
2401 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2402 so in the narrow (UTF-16) build case it's the longest unichr
2403 escape.
2404 */
2405
Walter Dörwald79e913e2007-05-12 11:08:06 +00002406 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002407#ifdef Py_UNICODE_WIDE
2408 + 10*size
2409#else
2410 + 6*size
2411#endif
2412 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 if (repr == NULL)
2414 return NULL;
2415
Walter Dörwald79e913e2007-05-12 11:08:06 +00002416 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 while (size-- > 0) {
2419 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002420
Walter Dörwald79e913e2007-05-12 11:08:06 +00002421 /* Escape backslashes */
2422 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 *p++ = '\\';
2424 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002425 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002426 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002427
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002428#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002429 /* Map 21-bit characters to '\U00xxxxxx' */
2430 else if (ch >= 0x10000) {
2431 *p++ = '\\';
2432 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002433 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2434 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2435 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2436 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2437 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2438 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2439 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2440 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002441 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002442 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002443#else
2444 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002445 else if (ch >= 0xD800 && ch < 0xDC00) {
2446 Py_UNICODE ch2;
2447 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002448
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002449 ch2 = *s++;
2450 size--;
2451 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2452 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2453 *p++ = '\\';
2454 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002455 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2456 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2457 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2458 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2459 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2460 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2461 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2462 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002463 continue;
2464 }
2465 /* Fall through: isolated surrogates are copied as-is */
2466 s--;
2467 size++;
2468 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002469#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002470
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002472 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 *p++ = '\\';
2474 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002475 *p++ = hexdigits[(ch >> 12) & 0x000F];
2476 *p++ = hexdigits[(ch >> 8) & 0x000F];
2477 *p++ = hexdigits[(ch >> 4) & 0x000F];
2478 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002480
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002481 /* Map special whitespace to '\t', \n', '\r' */
2482 else if (ch == '\t') {
2483 *p++ = '\\';
2484 *p++ = 't';
2485 }
2486 else if (ch == '\n') {
2487 *p++ = '\\';
2488 *p++ = 'n';
2489 }
2490 else if (ch == '\r') {
2491 *p++ = '\\';
2492 *p++ = 'r';
2493 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002494
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002495 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002496 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002498 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002499 *p++ = hexdigits[(ch >> 4) & 0x000F];
2500 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002501 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002502
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 /* Copy everything else as-is */
2504 else
2505 *p++ = (char) ch;
2506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507
2508 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002509 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2510 Py_DECREF(repr);
2511 return NULL;
2512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 return repr;
2514}
2515
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2517{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002518 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 if (!PyUnicode_Check(unicode)) {
2520 PyErr_BadArgument();
2521 return NULL;
2522 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002523 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2524 PyUnicode_GET_SIZE(unicode));
2525
2526 if (!s)
2527 return NULL;
2528 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2529 PyBytes_GET_SIZE(s));
2530 Py_DECREF(s);
2531 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532}
2533
2534/* --- Raw Unicode Escape Codec ------------------------------------------- */
2535
2536PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002537 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 const char *errors)
2539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002541 Py_ssize_t startinpos;
2542 Py_ssize_t endinpos;
2543 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002545 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 const char *end;
2547 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 PyObject *errorHandler = NULL;
2549 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002550
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 /* Escaped strings will always be longer than the resulting
2552 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 length after conversion to the true value. (But decoding error
2554 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 v = _PyUnicode_New(size);
2556 if (v == NULL)
2557 goto onError;
2558 if (size == 0)
2559 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 end = s + size;
2562 while (s < end) {
2563 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002564 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002566 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567
2568 /* Non-escape characters are interpreted as Unicode ordinals */
2569 if (*s != '\\') {
2570 *p++ = (unsigned char)*s++;
2571 continue;
2572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574
2575 /* \u-escapes are only interpreted iff the number of leading
2576 backslashes if odd */
2577 bs = s;
2578 for (;s < end;) {
2579 if (*s != '\\')
2580 break;
2581 *p++ = (unsigned char)*s++;
2582 }
2583 if (((s - bs) & 1) == 0 ||
2584 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002585 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 continue;
2587 }
2588 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002589 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590 s++;
2591
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002592 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002594 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 endinpos = s-starts;
2598 if (unicode_decode_call_errorhandler(
2599 errors, &errorHandler,
2600 "rawunicodeescape", "truncated \\uXXXX",
2601 starts, size, &startinpos, &endinpos, &exc, &s,
2602 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002604 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 }
2606 x = (x<<4) & ~0xF;
2607 if (c >= '0' && c <= '9')
2608 x += c - '0';
2609 else if (c >= 'a' && c <= 'f')
2610 x += 10 + c - 'a';
2611 else
2612 x += 10 + c - 'A';
2613 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002614#ifndef Py_UNICODE_WIDE
2615 if (x > 0x10000) {
2616 if (unicode_decode_call_errorhandler(
2617 errors, &errorHandler,
2618 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2619 starts, size, &startinpos, &endinpos, &exc, &s,
2620 (PyObject **)&v, &outpos, &p))
2621 goto onError;
2622 }
2623#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 *p++ = x;
2625 nextByte:
2626 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002628 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002629 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002633
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 onError:
2635 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 Py_XDECREF(errorHandler);
2637 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 return NULL;
2639}
2640
2641PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002642 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643{
2644 PyObject *repr;
2645 char *p;
2646 char *q;
2647
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002648#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002649 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002650#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002651 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002652#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 if (repr == NULL)
2654 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002655 if (size == 0)
2656 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657
Walter Dörwald711005d2007-05-12 12:03:26 +00002658 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 while (size-- > 0) {
2660 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002661#ifdef Py_UNICODE_WIDE
2662 /* Map 32-bit characters to '\Uxxxxxxxx' */
2663 if (ch >= 0x10000) {
2664 *p++ = '\\';
2665 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002666 *p++ = hexdigits[(ch >> 28) & 0xf];
2667 *p++ = hexdigits[(ch >> 24) & 0xf];
2668 *p++ = hexdigits[(ch >> 20) & 0xf];
2669 *p++ = hexdigits[(ch >> 16) & 0xf];
2670 *p++ = hexdigits[(ch >> 12) & 0xf];
2671 *p++ = hexdigits[(ch >> 8) & 0xf];
2672 *p++ = hexdigits[(ch >> 4) & 0xf];
2673 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002674 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002675 else
2676#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Map 16-bit characters to '\uxxxx' */
2678 if (ch >= 256) {
2679 *p++ = '\\';
2680 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002681 *p++ = hexdigits[(ch >> 12) & 0xf];
2682 *p++ = hexdigits[(ch >> 8) & 0xf];
2683 *p++ = hexdigits[(ch >> 4) & 0xf];
2684 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 }
2686 /* Copy everything else as-is */
2687 else
2688 *p++ = (char) ch;
2689 }
2690 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002691 if (PyBytes_Resize(repr, p - q)) {
2692 Py_DECREF(repr);
2693 return NULL;
2694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 return repr;
2696}
2697
2698PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2699{
Walter Dörwald711005d2007-05-12 12:03:26 +00002700 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002702 PyErr_BadArgument();
2703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002705 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2706 PyUnicode_GET_SIZE(unicode));
2707
2708 if (!s)
2709 return NULL;
2710 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2711 PyBytes_GET_SIZE(s));
2712 Py_DECREF(s);
2713 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714}
2715
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002716/* --- Unicode Internal Codec ------------------------------------------- */
2717
2718PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002719 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002720 const char *errors)
2721{
2722 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t startinpos;
2724 Py_ssize_t endinpos;
2725 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002726 PyUnicodeObject *v;
2727 Py_UNICODE *p;
2728 const char *end;
2729 const char *reason;
2730 PyObject *errorHandler = NULL;
2731 PyObject *exc = NULL;
2732
Neal Norwitzd43069c2006-01-08 01:12:10 +00002733#ifdef Py_UNICODE_WIDE
2734 Py_UNICODE unimax = PyUnicode_GetMax();
2735#endif
2736
Thomas Wouters89f507f2006-12-13 04:49:30 +00002737 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002738 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2739 if (v == NULL)
2740 goto onError;
2741 if (PyUnicode_GetSize((PyObject *)v) == 0)
2742 return (PyObject *)v;
2743 p = PyUnicode_AS_UNICODE(v);
2744 end = s + size;
2745
2746 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002747 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002748 /* We have to sanity check the raw data, otherwise doom looms for
2749 some malformed UCS-4 data. */
2750 if (
2751 #ifdef Py_UNICODE_WIDE
2752 *p > unimax || *p < 0 ||
2753 #endif
2754 end-s < Py_UNICODE_SIZE
2755 )
2756 {
2757 startinpos = s - starts;
2758 if (end-s < Py_UNICODE_SIZE) {
2759 endinpos = end-starts;
2760 reason = "truncated input";
2761 }
2762 else {
2763 endinpos = s - starts + Py_UNICODE_SIZE;
2764 reason = "illegal code point (> 0x10FFFF)";
2765 }
2766 outpos = p - PyUnicode_AS_UNICODE(v);
2767 if (unicode_decode_call_errorhandler(
2768 errors, &errorHandler,
2769 "unicode_internal", reason,
2770 starts, size, &startinpos, &endinpos, &exc, &s,
2771 (PyObject **)&v, &outpos, &p)) {
2772 goto onError;
2773 }
2774 }
2775 else {
2776 p++;
2777 s += Py_UNICODE_SIZE;
2778 }
2779 }
2780
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002781 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002782 goto onError;
2783 Py_XDECREF(errorHandler);
2784 Py_XDECREF(exc);
2785 return (PyObject *)v;
2786
2787 onError:
2788 Py_XDECREF(v);
2789 Py_XDECREF(errorHandler);
2790 Py_XDECREF(exc);
2791 return NULL;
2792}
2793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794/* --- Latin-1 Codec ------------------------------------------------------ */
2795
2796PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 const char *errors)
2799{
2800 PyUnicodeObject *v;
2801 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002802
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002804 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002805 Py_UNICODE r = *(unsigned char*)s;
2806 return PyUnicode_FromUnicode(&r, 1);
2807 }
2808
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 v = _PyUnicode_New(size);
2810 if (v == NULL)
2811 goto onError;
2812 if (size == 0)
2813 return (PyObject *)v;
2814 p = PyUnicode_AS_UNICODE(v);
2815 while (size-- > 0)
2816 *p++ = (unsigned char)*s++;
2817 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 onError:
2820 Py_XDECREF(v);
2821 return NULL;
2822}
2823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824/* create or adjust a UnicodeEncodeError */
2825static void make_encode_exception(PyObject **exceptionObject,
2826 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002827 const Py_UNICODE *unicode, Py_ssize_t size,
2828 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 if (*exceptionObject == NULL) {
2832 *exceptionObject = PyUnicodeEncodeError_Create(
2833 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
2835 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2837 goto onError;
2838 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2839 goto onError;
2840 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2841 goto onError;
2842 return;
2843 onError:
2844 Py_DECREF(*exceptionObject);
2845 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 }
2847}
2848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849/* raises a UnicodeEncodeError */
2850static void raise_encode_exception(PyObject **exceptionObject,
2851 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 const Py_UNICODE *unicode, Py_ssize_t size,
2853 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 const char *reason)
2855{
2856 make_encode_exception(exceptionObject,
2857 encoding, unicode, size, startpos, endpos, reason);
2858 if (*exceptionObject != NULL)
2859 PyCodec_StrictErrors(*exceptionObject);
2860}
2861
2862/* error handling callback helper:
2863 build arguments, call the callback and check the arguments,
2864 put the result into newpos and return the replacement string, which
2865 has to be freed by the caller */
2866static PyObject *unicode_encode_call_errorhandler(const char *errors,
2867 PyObject **errorHandler,
2868 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2870 Py_ssize_t startpos, Py_ssize_t endpos,
2871 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002873 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874
2875 PyObject *restuple;
2876 PyObject *resunicode;
2877
2878 if (*errorHandler == NULL) {
2879 *errorHandler = PyCodec_LookupError(errors);
2880 if (*errorHandler == NULL)
2881 return NULL;
2882 }
2883
2884 make_encode_exception(exceptionObject,
2885 encoding, unicode, size, startpos, endpos, reason);
2886 if (*exceptionObject == NULL)
2887 return NULL;
2888
2889 restuple = PyObject_CallFunctionObjArgs(
2890 *errorHandler, *exceptionObject, NULL);
2891 if (restuple == NULL)
2892 return NULL;
2893 if (!PyTuple_Check(restuple)) {
2894 PyErr_Format(PyExc_TypeError, &argparse[4]);
2895 Py_DECREF(restuple);
2896 return NULL;
2897 }
2898 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2899 &resunicode, newpos)) {
2900 Py_DECREF(restuple);
2901 return NULL;
2902 }
2903 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002904 *newpos = size+*newpos;
2905 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002907 Py_DECREF(restuple);
2908 return NULL;
2909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 Py_INCREF(resunicode);
2911 Py_DECREF(restuple);
2912 return resunicode;
2913}
2914
2915static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 const char *errors,
2918 int limit)
2919{
2920 /* output object */
2921 PyObject *res;
2922 /* pointers to the beginning and end+1 of input */
2923 const Py_UNICODE *startp = p;
2924 const Py_UNICODE *endp = p + size;
2925 /* pointer to the beginning of the unencodable characters */
2926 /* const Py_UNICODE *badp = NULL; */
2927 /* pointer into the output */
2928 char *str;
2929 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002930 Py_ssize_t respos = 0;
2931 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002932 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2933 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934 PyObject *errorHandler = NULL;
2935 PyObject *exc = NULL;
2936 /* the following variable is used for caching string comparisons
2937 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2938 int known_errorHandler = -1;
2939
2940 /* allocate enough for a simple encoding without
2941 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002942 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 if (res == NULL)
2944 goto onError;
2945 if (size == 0)
2946 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002947 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 ressize = size;
2949
2950 while (p<endp) {
2951 Py_UNICODE c = *p;
2952
2953 /* can we encode this? */
2954 if (c<limit) {
2955 /* no overflow check, because we know that the space is enough */
2956 *str++ = (char)c;
2957 ++p;
2958 }
2959 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002960 Py_ssize_t unicodepos = p-startp;
2961 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002963 Py_ssize_t repsize;
2964 Py_ssize_t newpos;
2965 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 Py_UNICODE *uni2;
2967 /* startpos for collecting unencodable chars */
2968 const Py_UNICODE *collstart = p;
2969 const Py_UNICODE *collend = p;
2970 /* find all unecodable characters */
2971 while ((collend < endp) && ((*collend)>=limit))
2972 ++collend;
2973 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2974 if (known_errorHandler==-1) {
2975 if ((errors==NULL) || (!strcmp(errors, "strict")))
2976 known_errorHandler = 1;
2977 else if (!strcmp(errors, "replace"))
2978 known_errorHandler = 2;
2979 else if (!strcmp(errors, "ignore"))
2980 known_errorHandler = 3;
2981 else if (!strcmp(errors, "xmlcharrefreplace"))
2982 known_errorHandler = 4;
2983 else
2984 known_errorHandler = 0;
2985 }
2986 switch (known_errorHandler) {
2987 case 1: /* strict */
2988 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2989 goto onError;
2990 case 2: /* replace */
2991 while (collstart++<collend)
2992 *str++ = '?'; /* fall through */
2993 case 3: /* ignore */
2994 p = collend;
2995 break;
2996 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002997 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 /* determine replacement size (temporarily (mis)uses p) */
2999 for (p = collstart, repsize = 0; p < collend; ++p) {
3000 if (*p<10)
3001 repsize += 2+1+1;
3002 else if (*p<100)
3003 repsize += 2+2+1;
3004 else if (*p<1000)
3005 repsize += 2+3+1;
3006 else if (*p<10000)
3007 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003008#ifndef Py_UNICODE_WIDE
3009 else
3010 repsize += 2+5+1;
3011#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 else if (*p<100000)
3013 repsize += 2+5+1;
3014 else if (*p<1000000)
3015 repsize += 2+6+1;
3016 else
3017 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003018#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 }
3020 requiredsize = respos+repsize+(endp-collend);
3021 if (requiredsize > ressize) {
3022 if (requiredsize<2*ressize)
3023 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003024 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003026 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 ressize = requiredsize;
3028 }
3029 /* generate replacement (temporarily (mis)uses p) */
3030 for (p = collstart; p < collend; ++p) {
3031 str += sprintf(str, "&#%d;", (int)*p);
3032 }
3033 p = collend;
3034 break;
3035 default:
3036 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3037 encoding, reason, startp, size, &exc,
3038 collstart-startp, collend-startp, &newpos);
3039 if (repunicode == NULL)
3040 goto onError;
3041 /* need more space? (at least enough for what we
3042 have+the replacement+the rest of the string, so
3043 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003044 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 repsize = PyUnicode_GET_SIZE(repunicode);
3046 requiredsize = respos+repsize+(endp-collend);
3047 if (requiredsize > ressize) {
3048 if (requiredsize<2*ressize)
3049 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003050 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 Py_DECREF(repunicode);
3052 goto onError;
3053 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003054 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 ressize = requiredsize;
3056 }
3057 /* check if there is anything unencodable in the replacement
3058 and copy it to the output */
3059 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3060 c = *uni2;
3061 if (c >= limit) {
3062 raise_encode_exception(&exc, encoding, startp, size,
3063 unicodepos, unicodepos+1, reason);
3064 Py_DECREF(repunicode);
3065 goto onError;
3066 }
3067 *str = (char)c;
3068 }
3069 p = startp + newpos;
3070 Py_DECREF(repunicode);
3071 }
3072 }
3073 }
3074 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003075 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 if (respos<ressize)
3077 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003078 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 Py_XDECREF(errorHandler);
3080 Py_XDECREF(exc);
3081 return res;
3082
3083 onError:
3084 Py_XDECREF(res);
3085 Py_XDECREF(errorHandler);
3086 Py_XDECREF(exc);
3087 return NULL;
3088}
3089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 const char *errors)
3093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
3097PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3098{
3099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
3101 return NULL;
3102 }
3103 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3104 PyUnicode_GET_SIZE(unicode),
3105 NULL);
3106}
3107
3108/* --- 7-bit ASCII Codec -------------------------------------------------- */
3109
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 const char *errors)
3113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 PyUnicodeObject *v;
3116 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t startinpos;
3118 Py_ssize_t endinpos;
3119 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 const char *e;
3121 PyObject *errorHandler = NULL;
3122 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003125 if (size == 1 && *(unsigned char*)s < 128) {
3126 Py_UNICODE r = *(unsigned char*)s;
3127 return PyUnicode_FromUnicode(&r, 1);
3128 }
Tim Petersced69f82003-09-16 20:30:58 +00003129
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 v = _PyUnicode_New(size);
3131 if (v == NULL)
3132 goto onError;
3133 if (size == 0)
3134 return (PyObject *)v;
3135 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 e = s + size;
3137 while (s < e) {
3138 register unsigned char c = (unsigned char)*s;
3139 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 ++s;
3142 }
3143 else {
3144 startinpos = s-starts;
3145 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003146 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 if (unicode_decode_call_errorhandler(
3148 errors, &errorHandler,
3149 "ascii", "ordinal not in range(128)",
3150 starts, size, &startinpos, &endinpos, &exc, &s,
3151 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003155 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003156 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003157 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 Py_XDECREF(errorHandler);
3159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003161
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 onError:
3163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 Py_XDECREF(errorHandler);
3165 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 return NULL;
3167}
3168
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 const char *errors)
3172{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174}
3175
3176PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3177{
3178 if (!PyUnicode_Check(unicode)) {
3179 PyErr_BadArgument();
3180 return NULL;
3181 }
3182 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3183 PyUnicode_GET_SIZE(unicode),
3184 NULL);
3185}
3186
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003187#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003188
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003189/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003190
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003191#if SIZEOF_INT < SIZEOF_SSIZE_T
3192#define NEED_RETRY
3193#endif
3194
3195/* XXX This code is limited to "true" double-byte encodings, as
3196 a) it assumes an incomplete character consists of a single byte, and
3197 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3198 encodings, see IsDBCSLeadByteEx documentation. */
3199
/* Return nonzero if the byte at s[offset] is a DBCS lead byte, judged both
   by IsDBCSLeadByte() and by where CharPrev() places the preceding
   character boundary; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3210
3211/*
3212 * Decode MBCS string into unicode object. If 'final' is set, converts
3213 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3214 */
3215static int decode_mbcs(PyUnicodeObject **v,
3216 const char *s, /* MBCS string */
3217 int size, /* sizeof MBCS string */
3218 int final)
3219{
3220 Py_UNICODE *p;
3221 Py_ssize_t n = 0;
3222 int usize = 0;
3223
3224 assert(size >= 0);
3225
3226 /* Skip trailing lead-byte unless 'final' is set */
3227 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3228 --size;
3229
3230 /* First get the size of the result */
3231 if (size > 0) {
3232 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3233 if (usize == 0) {
3234 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3235 return -1;
3236 }
3237 }
3238
3239 if (*v == NULL) {
3240 /* Create unicode object */
3241 *v = _PyUnicode_New(usize);
3242 if (*v == NULL)
3243 return -1;
3244 }
3245 else {
3246 /* Extend unicode object */
3247 n = PyUnicode_GET_SIZE(*v);
3248 if (_PyUnicode_Resize(v, n + usize) < 0)
3249 return -1;
3250 }
3251
3252 /* Do the conversion */
3253 if (size > 0) {
3254 p = PyUnicode_AS_UNICODE(*v) + n;
3255 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3256 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3257 return -1;
3258 }
3259 }
3260
3261 return size;
3262}
3263
3264PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3265 Py_ssize_t size,
3266 const char *errors,
3267 Py_ssize_t *consumed)
3268{
3269 PyUnicodeObject *v = NULL;
3270 int done;
3271
3272 if (consumed)
3273 *consumed = 0;
3274
3275#ifdef NEED_RETRY
3276 retry:
3277 if (size > INT_MAX)
3278 done = decode_mbcs(&v, s, INT_MAX, 0);
3279 else
3280#endif
3281 done = decode_mbcs(&v, s, (int)size, !consumed);
3282
3283 if (done < 0) {
3284 Py_XDECREF(v);
3285 return NULL;
3286 }
3287
3288 if (consumed)
3289 *consumed += done;
3290
3291#ifdef NEED_RETRY
3292 if (size > INT_MAX) {
3293 s += done;
3294 size -= done;
3295 goto retry;
3296 }
3297#endif
3298
3299 return (PyObject *)v;
3300}
3301
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003302PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003304 const char *errors)
3305{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003306 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3307}
3308
3309/*
3310 * Convert unicode into string object (MBCS).
3311 * Returns 0 if succeed, -1 otherwise.
3312 */
3313static int encode_mbcs(PyObject **repr,
3314 const Py_UNICODE *p, /* unicode */
3315 int size) /* size of unicode */
3316{
3317 int mbcssize = 0;
3318 Py_ssize_t n = 0;
3319
3320 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003321
3322 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003323 if (size > 0) {
3324 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3325 if (mbcssize == 0) {
3326 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3327 return -1;
3328 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003329 }
3330
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003331 if (*repr == NULL) {
3332 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003333 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003334 if (*repr == NULL)
3335 return -1;
3336 }
3337 else {
3338 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003339 n = PyBytes_Size(*repr);
3340 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003341 return -1;
3342 }
3343
3344 /* Do the conversion */
3345 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003346 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003347 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3348 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3349 return -1;
3350 }
3351 }
3352
3353 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003354}
3355
3356PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003357 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003358 const char *errors)
3359{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003360 PyObject *repr = NULL;
3361 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003362
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003363#ifdef NEED_RETRY
3364 retry:
3365 if (size > INT_MAX)
3366 ret = encode_mbcs(&repr, p, INT_MAX);
3367 else
3368#endif
3369 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003370
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003371 if (ret < 0) {
3372 Py_XDECREF(repr);
3373 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003375
3376#ifdef NEED_RETRY
3377 if (size > INT_MAX) {
3378 p += INT_MAX;
3379 size -= INT_MAX;
3380 goto retry;
3381 }
3382#endif
3383
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003384 return repr;
3385}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003386
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003387PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3388{
3389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
3391 return NULL;
3392 }
3393 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3394 PyUnicode_GET_SIZE(unicode),
3395 NULL);
3396}
3397
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003398#undef NEED_RETRY
3399
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003400#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003401
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402/* --- Character Mapping Codec -------------------------------------------- */
3403
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003405 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 PyObject *mapping,
3407 const char *errors)
3408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003410 Py_ssize_t startinpos;
3411 Py_ssize_t endinpos;
3412 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 PyUnicodeObject *v;
3415 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003416 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 PyObject *errorHandler = NULL;
3418 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003419 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003420 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003421
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 /* Default to Latin-1 */
3423 if (mapping == NULL)
3424 return PyUnicode_DecodeLatin1(s, size, errors);
3425
3426 v = _PyUnicode_New(size);
3427 if (v == NULL)
3428 goto onError;
3429 if (size == 0)
3430 return (PyObject *)v;
3431 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003433 if (PyUnicode_CheckExact(mapping)) {
3434 mapstring = PyUnicode_AS_UNICODE(mapping);
3435 maplen = PyUnicode_GET_SIZE(mapping);
3436 while (s < e) {
3437 unsigned char ch = *s;
3438 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003440 if (ch < maplen)
3441 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003443 if (x == 0xfffe) {
3444 /* undefined mapping */
3445 outpos = p-PyUnicode_AS_UNICODE(v);
3446 startinpos = s-starts;
3447 endinpos = startinpos+1;
3448 if (unicode_decode_call_errorhandler(
3449 errors, &errorHandler,
3450 "charmap", "character maps to <undefined>",
3451 starts, size, &startinpos, &endinpos, &exc, &s,
3452 (PyObject **)&v, &outpos, &p)) {
3453 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003454 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003455 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003456 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003457 *p++ = x;
3458 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003460 }
3461 else {
3462 while (s < e) {
3463 unsigned char ch = *s;
3464 PyObject *w, *x;
3465
3466 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3467 w = PyInt_FromLong((long)ch);
3468 if (w == NULL)
3469 goto onError;
3470 x = PyObject_GetItem(mapping, w);
3471 Py_DECREF(w);
3472 if (x == NULL) {
3473 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3474 /* No mapping found means: mapping is undefined. */
3475 PyErr_Clear();
3476 x = Py_None;
3477 Py_INCREF(x);
3478 } else
3479 goto onError;
3480 }
3481
3482 /* Apply mapping */
3483 if (PyInt_Check(x)) {
3484 long value = PyInt_AS_LONG(x);
3485 if (value < 0 || value > 65535) {
3486 PyErr_SetString(PyExc_TypeError,
3487 "character mapping must be in range(65536)");
3488 Py_DECREF(x);
3489 goto onError;
3490 }
3491 *p++ = (Py_UNICODE)value;
3492 }
3493 else if (x == Py_None) {
3494 /* undefined mapping */
3495 outpos = p-PyUnicode_AS_UNICODE(v);
3496 startinpos = s-starts;
3497 endinpos = startinpos+1;
3498 if (unicode_decode_call_errorhandler(
3499 errors, &errorHandler,
3500 "charmap", "character maps to <undefined>",
3501 starts, size, &startinpos, &endinpos, &exc, &s,
3502 (PyObject **)&v, &outpos, &p)) {
3503 Py_DECREF(x);
3504 goto onError;
3505 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003506 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003507 continue;
3508 }
3509 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003510 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003511
3512 if (targetsize == 1)
3513 /* 1-1 mapping */
3514 *p++ = *PyUnicode_AS_UNICODE(x);
3515
3516 else if (targetsize > 1) {
3517 /* 1-n mapping */
3518 if (targetsize > extrachars) {
3519 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003520 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3521 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003522 (targetsize << 2);
3523 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003524 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003525 if (_PyUnicode_Resize(&v,
3526 PyUnicode_GET_SIZE(v) + needed) < 0) {
3527 Py_DECREF(x);
3528 goto onError;
3529 }
3530 p = PyUnicode_AS_UNICODE(v) + oldpos;
3531 }
3532 Py_UNICODE_COPY(p,
3533 PyUnicode_AS_UNICODE(x),
3534 targetsize);
3535 p += targetsize;
3536 extrachars -= targetsize;
3537 }
3538 /* 1-0 mapping: skip the character */
3539 }
3540 else {
3541 /* wrong return value */
3542 PyErr_SetString(PyExc_TypeError,
3543 "character mapping must return integer, None or unicode");
3544 Py_DECREF(x);
3545 goto onError;
3546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003548 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 }
3551 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003552 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 Py_XDECREF(errorHandler);
3555 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_XDECREF(errorHandler);
3560 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 Py_XDECREF(v);
3562 return NULL;
3563}
3564
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003565/* Charmap encoding: the lookup table */
3566
3567struct encoding_map{
3568 PyObject_HEAD
3569 unsigned char level1[32];
3570 int count2, count3;
3571 unsigned char level23[1];
3572};
3573
3574static PyObject*
3575encoding_map_size(PyObject *obj, PyObject* args)
3576{
3577 struct encoding_map *map = (struct encoding_map*)obj;
3578 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3579 128*map->count3);
3580}
3581
3582static PyMethodDef encoding_map_methods[] = {
3583 {"size", encoding_map_size, METH_NOARGS,
3584 PyDoc_STR("Return the size (in bytes) of this object") },
3585 { 0 }
3586};
3587
3588static void
3589encoding_map_dealloc(PyObject* o)
3590{
3591 PyObject_FREE(o);
3592}
3593
3594static PyTypeObject EncodingMapType = {
3595 PyObject_HEAD_INIT(NULL)
3596 0, /*ob_size*/
3597 "EncodingMap", /*tp_name*/
3598 sizeof(struct encoding_map), /*tp_basicsize*/
3599 0, /*tp_itemsize*/
3600 /* methods */
3601 encoding_map_dealloc, /*tp_dealloc*/
3602 0, /*tp_print*/
3603 0, /*tp_getattr*/
3604 0, /*tp_setattr*/
3605 0, /*tp_compare*/
3606 0, /*tp_repr*/
3607 0, /*tp_as_number*/
3608 0, /*tp_as_sequence*/
3609 0, /*tp_as_mapping*/
3610 0, /*tp_hash*/
3611 0, /*tp_call*/
3612 0, /*tp_str*/
3613 0, /*tp_getattro*/
3614 0, /*tp_setattro*/
3615 0, /*tp_as_buffer*/
3616 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3617 0, /*tp_doc*/
3618 0, /*tp_traverse*/
3619 0, /*tp_clear*/
3620 0, /*tp_richcompare*/
3621 0, /*tp_weaklistoffset*/
3622 0, /*tp_iter*/
3623 0, /*tp_iternext*/
3624 encoding_map_methods, /*tp_methods*/
3625 0, /*tp_members*/
3626 0, /*tp_getset*/
3627 0, /*tp_base*/
3628 0, /*tp_dict*/
3629 0, /*tp_descr_get*/
3630 0, /*tp_descr_set*/
3631 0, /*tp_dictoffset*/
3632 0, /*tp_init*/
3633 0, /*tp_alloc*/
3634 0, /*tp_new*/
3635 0, /*tp_free*/
3636 0, /*tp_is_gc*/
3637};
3638
3639PyObject*
3640PyUnicode_BuildEncodingMap(PyObject* string)
3641{
3642 Py_UNICODE *decode;
3643 PyObject *result;
3644 struct encoding_map *mresult;
3645 int i;
3646 int need_dict = 0;
3647 unsigned char level1[32];
3648 unsigned char level2[512];
3649 unsigned char *mlevel1, *mlevel2, *mlevel3;
3650 int count2 = 0, count3 = 0;
3651
3652 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3653 PyErr_BadArgument();
3654 return NULL;
3655 }
3656 decode = PyUnicode_AS_UNICODE(string);
3657 memset(level1, 0xFF, sizeof level1);
3658 memset(level2, 0xFF, sizeof level2);
3659
3660 /* If there isn't a one-to-one mapping of NULL to \0,
3661 or if there are non-BMP characters, we need to use
3662 a mapping dictionary. */
3663 if (decode[0] != 0)
3664 need_dict = 1;
3665 for (i = 1; i < 256; i++) {
3666 int l1, l2;
3667 if (decode[i] == 0
3668 #ifdef Py_UNICODE_WIDE
3669 || decode[i] > 0xFFFF
3670 #endif
3671 ) {
3672 need_dict = 1;
3673 break;
3674 }
3675 if (decode[i] == 0xFFFE)
3676 /* unmapped character */
3677 continue;
3678 l1 = decode[i] >> 11;
3679 l2 = decode[i] >> 7;
3680 if (level1[l1] == 0xFF)
3681 level1[l1] = count2++;
3682 if (level2[l2] == 0xFF)
3683 level2[l2] = count3++;
3684 }
3685
3686 if (count2 >= 0xFF || count3 >= 0xFF)
3687 need_dict = 1;
3688
3689 if (need_dict) {
3690 PyObject *result = PyDict_New();
3691 PyObject *key, *value;
3692 if (!result)
3693 return NULL;
3694 for (i = 0; i < 256; i++) {
3695 key = value = NULL;
3696 key = PyInt_FromLong(decode[i]);
3697 value = PyInt_FromLong(i);
3698 if (!key || !value)
3699 goto failed1;
3700 if (PyDict_SetItem(result, key, value) == -1)
3701 goto failed1;
3702 Py_DECREF(key);
3703 Py_DECREF(value);
3704 }
3705 return result;
3706 failed1:
3707 Py_XDECREF(key);
3708 Py_XDECREF(value);
3709 Py_DECREF(result);
3710 return NULL;
3711 }
3712
3713 /* Create a three-level trie */
3714 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3715 16*count2 + 128*count3 - 1);
3716 if (!result)
3717 return PyErr_NoMemory();
3718 PyObject_Init(result, &EncodingMapType);
3719 mresult = (struct encoding_map*)result;
3720 mresult->count2 = count2;
3721 mresult->count3 = count3;
3722 mlevel1 = mresult->level1;
3723 mlevel2 = mresult->level23;
3724 mlevel3 = mresult->level23 + 16*count2;
3725 memcpy(mlevel1, level1, 32);
3726 memset(mlevel2, 0xFF, 16*count2);
3727 memset(mlevel3, 0, 128*count3);
3728 count3 = 0;
3729 for (i = 1; i < 256; i++) {
3730 int o1, o2, o3, i2, i3;
3731 if (decode[i] == 0xFFFE)
3732 /* unmapped character */
3733 continue;
3734 o1 = decode[i]>>11;
3735 o2 = (decode[i]>>7) & 0xF;
3736 i2 = 16*mlevel1[o1] + o2;
3737 if (mlevel2[i2] == 0xFF)
3738 mlevel2[i2] = count3++;
3739 o3 = decode[i] & 0x7F;
3740 i3 = 128*mlevel2[i2] + o3;
3741 mlevel3[i3] = i;
3742 }
3743 return result;
3744}
3745
3746static int
3747encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3748{
3749 struct encoding_map *map = (struct encoding_map*)mapping;
3750 int l1 = c>>11;
3751 int l2 = (c>>7) & 0xF;
3752 int l3 = c & 0x7F;
3753 int i;
3754
3755#ifdef Py_UNICODE_WIDE
3756 if (c > 0xFFFF) {
3757 return -1;
3758 }
3759#endif
3760 if (c == 0)
3761 return 0;
3762 /* level 1*/
3763 i = map->level1[l1];
3764 if (i == 0xFF) {
3765 return -1;
3766 }
3767 /* level 2*/
3768 i = map->level23[16*i+l2];
3769 if (i == 0xFF) {
3770 return -1;
3771 }
3772 /* level 3 */
3773 i = map->level23[16*map->count2 + 128*i + l3];
3774 if (i == 0) {
3775 return -1;
3776 }
3777 return i;
3778}
3779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780/* Lookup the character ch in the mapping. If the character
3781 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003782 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 PyObject *w = PyInt_FromLong((long)c);
3786 PyObject *x;
3787
3788 if (w == NULL)
3789 return NULL;
3790 x = PyObject_GetItem(mapping, w);
3791 Py_DECREF(w);
3792 if (x == NULL) {
3793 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3794 /* No mapping found means: mapping is undefined. */
3795 PyErr_Clear();
3796 x = Py_None;
3797 Py_INCREF(x);
3798 return x;
3799 } else
3800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003802 else if (x == Py_None)
3803 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 else if (PyInt_Check(x)) {
3805 long value = PyInt_AS_LONG(x);
3806 if (value < 0 || value > 255) {
3807 PyErr_SetString(PyExc_TypeError,
3808 "character mapping must be in range(256)");
3809 Py_DECREF(x);
3810 return NULL;
3811 }
3812 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 else if (PyString_Check(x))
3815 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003818 PyErr_Format(PyExc_TypeError,
3819 "character mapping must return integer, None or str8, not %.400s",
3820 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_DECREF(x);
3822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 }
3824}
3825
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003826static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003827charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003828{
Walter Dörwald827b0552007-05-12 13:23:53 +00003829 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003830 /* exponentially overallocate to minimize reallocations */
3831 if (requiredsize < 2*outsize)
3832 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003833 if (PyBytes_Resize(outobj, requiredsize)) {
3834 Py_DECREF(outobj);
3835 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003836 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003837 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003838}
3839
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003843/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003844 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 space is available. Return a new reference to the object that
3846 was put in the output buffer, or Py_None, if the mapping was undefined
3847 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003848 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003850charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003851 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003853 PyObject *rep;
3854 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003855 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003857 if (mapping->ob_type == &EncodingMapType) {
3858 int res = encoding_map_lookup(c, mapping);
3859 Py_ssize_t requiredsize = *outpos+1;
3860 if (res == -1)
3861 return enc_FAILED;
3862 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003863 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003864 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003865 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003866 outstart[(*outpos)++] = (char)res;
3867 return enc_SUCCESS;
3868 }
3869
3870 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003872 return enc_EXCEPTION;
3873 else if (rep==Py_None) {
3874 Py_DECREF(rep);
3875 return enc_FAILED;
3876 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003880 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003882 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003884 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3886 }
3887 else {
3888 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3890 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003891 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003892 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003894 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003896 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 memcpy(outstart + *outpos, repchars, repsize);
3898 *outpos += repsize;
3899 }
3900 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003901 Py_DECREF(rep);
3902 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903}
3904
3905/* handle an error in PyUnicode_EncodeCharmap
3906 Return 0 on success, -1 on error */
3907static
3908int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003911 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003912 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913{
3914 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003915 Py_ssize_t repsize;
3916 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 Py_UNICODE *uni2;
3918 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t collstartpos = *inpos;
3920 Py_ssize_t collendpos = *inpos+1;
3921 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 char *encoding = "charmap";
3923 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003924 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 /* find all unencodable characters */
3927 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003928 PyObject *rep;
3929 if (mapping->ob_type == &EncodingMapType) {
3930 int res = encoding_map_lookup(p[collendpos], mapping);
3931 if (res != -1)
3932 break;
3933 ++collendpos;
3934 continue;
3935 }
3936
3937 rep = charmapencode_lookup(p[collendpos], mapping);
3938 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003940 else if (rep!=Py_None) {
3941 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 break;
3943 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003944 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 ++collendpos;
3946 }
3947 /* cache callback name lookup
3948 * (if not done yet, i.e. it's the first error) */
3949 if (*known_errorHandler==-1) {
3950 if ((errors==NULL) || (!strcmp(errors, "strict")))
3951 *known_errorHandler = 1;
3952 else if (!strcmp(errors, "replace"))
3953 *known_errorHandler = 2;
3954 else if (!strcmp(errors, "ignore"))
3955 *known_errorHandler = 3;
3956 else if (!strcmp(errors, "xmlcharrefreplace"))
3957 *known_errorHandler = 4;
3958 else
3959 *known_errorHandler = 0;
3960 }
3961 switch (*known_errorHandler) {
3962 case 1: /* strict */
3963 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3964 return -1;
3965 case 2: /* replace */
3966 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3967 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 return -1;
3970 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003971 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3973 return -1;
3974 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 }
3976 /* fall through */
3977 case 3: /* ignore */
3978 *inpos = collendpos;
3979 break;
3980 case 4: /* xmlcharrefreplace */
3981 /* generate replacement (temporarily (mis)uses p) */
3982 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3983 char buffer[2+29+1+1];
3984 char *cp;
3985 sprintf(buffer, "&#%d;", (int)p[collpos]);
3986 for (cp = buffer; *cp; ++cp) {
3987 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003988 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003990 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3992 return -1;
3993 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 }
3995 }
3996 *inpos = collendpos;
3997 break;
3998 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003999 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 encoding, reason, p, size, exceptionObject,
4001 collstartpos, collendpos, &newpos);
4002 if (repunicode == NULL)
4003 return -1;
4004 /* generate replacement */
4005 repsize = PyUnicode_GET_SIZE(repunicode);
4006 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4007 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 return -1;
4010 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4014 return -1;
4015 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 }
4017 *inpos = newpos;
4018 Py_DECREF(repunicode);
4019 }
4020 return 0;
4021}
4022
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 PyObject *mapping,
4026 const char *errors)
4027{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 /* output object */
4029 PyObject *res = NULL;
4030 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004031 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 PyObject *errorHandler = NULL;
4035 PyObject *exc = NULL;
4036 /* the following variable is used for caching string comparisons
4037 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4038 * 3=ignore, 4=xmlcharrefreplace */
4039 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040
4041 /* Default to Latin-1 */
4042 if (mapping == NULL)
4043 return PyUnicode_EncodeLatin1(p, size, errors);
4044
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 /* allocate enough for a simple encoding without
4046 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004047 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 if (res == NULL)
4049 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004050 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 while (inpos<size) {
4054 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004055 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004056 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004058 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 if (charmap_encoding_error(p, size, &inpos, mapping,
4060 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004061 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004062 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004063 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 else
4067 /* done with this character => adjust input position */
4068 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004072 if (respos<PyBytes_GET_SIZE(res)) {
4073 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 goto onError;
4075 }
4076 Py_XDECREF(exc);
4077 Py_XDECREF(errorHandler);
4078 return res;
4079
4080 onError:
4081 Py_XDECREF(res);
4082 Py_XDECREF(exc);
4083 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 return NULL;
4085}
4086
4087PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4088 PyObject *mapping)
4089{
4090 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4091 PyErr_BadArgument();
4092 return NULL;
4093 }
4094 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4095 PyUnicode_GET_SIZE(unicode),
4096 mapping,
4097 NULL);
4098}
4099
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100/* create or adjust a UnicodeTranslateError */
4101static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004102 const Py_UNICODE *unicode, Py_ssize_t size,
4103 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 if (*exceptionObject == NULL) {
4107 *exceptionObject = PyUnicodeTranslateError_Create(
4108 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 }
4110 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4112 goto onError;
4113 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4114 goto onError;
4115 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4116 goto onError;
4117 return;
4118 onError:
4119 Py_DECREF(*exceptionObject);
4120 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 }
4122}
4123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124/* raises a UnicodeTranslateError */
4125static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004126 const Py_UNICODE *unicode, Py_ssize_t size,
4127 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 const char *reason)
4129{
4130 make_translate_exception(exceptionObject,
4131 unicode, size, startpos, endpos, reason);
4132 if (*exceptionObject != NULL)
4133 PyCodec_StrictErrors(*exceptionObject);
4134}
4135
4136/* error handling callback helper:
4137 build arguments, call the callback and check the arguments,
4138 put the result into newpos and return the replacement string, which
4139 has to be freed by the caller */
4140static PyObject *unicode_translate_call_errorhandler(const char *errors,
4141 PyObject **errorHandler,
4142 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004143 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4144 Py_ssize_t startpos, Py_ssize_t endpos,
4145 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004147 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004149 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 PyObject *restuple;
4151 PyObject *resunicode;
4152
4153 if (*errorHandler == NULL) {
4154 *errorHandler = PyCodec_LookupError(errors);
4155 if (*errorHandler == NULL)
4156 return NULL;
4157 }
4158
4159 make_translate_exception(exceptionObject,
4160 unicode, size, startpos, endpos, reason);
4161 if (*exceptionObject == NULL)
4162 return NULL;
4163
4164 restuple = PyObject_CallFunctionObjArgs(
4165 *errorHandler, *exceptionObject, NULL);
4166 if (restuple == NULL)
4167 return NULL;
4168 if (!PyTuple_Check(restuple)) {
4169 PyErr_Format(PyExc_TypeError, &argparse[4]);
4170 Py_DECREF(restuple);
4171 return NULL;
4172 }
4173 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004174 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 Py_DECREF(restuple);
4176 return NULL;
4177 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 if (i_newpos<0)
4179 *newpos = size+i_newpos;
4180 else
4181 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004182 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004183 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004184 Py_DECREF(restuple);
4185 return NULL;
4186 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 Py_INCREF(resunicode);
4188 Py_DECREF(restuple);
4189 return resunicode;
4190}
4191
4192/* Lookup the character ch in the mapping and put the result in result,
4193 which must be decrefed by the caller.
4194 Return 0 on success, -1 on error */
4195static
4196int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4197{
4198 PyObject *w = PyInt_FromLong((long)c);
4199 PyObject *x;
4200
4201 if (w == NULL)
4202 return -1;
4203 x = PyObject_GetItem(mapping, w);
4204 Py_DECREF(w);
4205 if (x == NULL) {
4206 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4207 /* No mapping found means: use 1:1 mapping. */
4208 PyErr_Clear();
4209 *result = NULL;
4210 return 0;
4211 } else
4212 return -1;
4213 }
4214 else if (x == Py_None) {
4215 *result = x;
4216 return 0;
4217 }
4218 else if (PyInt_Check(x)) {
4219 long value = PyInt_AS_LONG(x);
4220 long max = PyUnicode_GetMax();
4221 if (value < 0 || value > max) {
4222 PyErr_Format(PyExc_TypeError,
4223 "character mapping must be in range(0x%lx)", max+1);
4224 Py_DECREF(x);
4225 return -1;
4226 }
4227 *result = x;
4228 return 0;
4229 }
4230 else if (PyUnicode_Check(x)) {
4231 *result = x;
4232 return 0;
4233 }
4234 else {
4235 /* wrong return value */
4236 PyErr_SetString(PyExc_TypeError,
4237 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004238 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 return -1;
4240 }
4241}
4242/* ensure that *outobj is at least requiredsize characters long,
4243if not reallocate and adjust various state variables.
4244Return 0 on success, -1 on error */
4245static
Walter Dörwald4894c302003-10-24 14:25:28 +00004246int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004247 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004249 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004250 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004252 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004254 if (requiredsize < 2 * oldsize)
4255 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004256 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 return -1;
4258 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 }
4260 return 0;
4261}
4262/* lookup the character, put the result in the output string and adjust
4263 various state variables. Return a new reference to the object that
4264 was put in the output buffer in *result, or Py_None, if the mapping was
4265 undefined (in which case no character was written).
4266 The called must decref result.
4267 Return 0 on success, -1 on error. */
4268static
Walter Dörwald4894c302003-10-24 14:25:28 +00004269int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004270 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004271 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272{
Walter Dörwald4894c302003-10-24 14:25:28 +00004273 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 return -1;
4275 if (*res==NULL) {
4276 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004277 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 }
4279 else if (*res==Py_None)
4280 ;
4281 else if (PyInt_Check(*res)) {
4282 /* no overflow check, because we know that the space is enough */
4283 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4284 }
4285 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004286 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 if (repsize==1) {
4288 /* no overflow check, because we know that the space is enough */
4289 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4290 }
4291 else if (repsize!=0) {
4292 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004293 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004294 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004295 repsize - 1;
4296 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 return -1;
4298 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4299 *outp += repsize;
4300 }
4301 }
4302 else
4303 return -1;
4304 return 0;
4305}
4306
4307PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004308 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 PyObject *mapping,
4310 const char *errors)
4311{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 /* output object */
4313 PyObject *res = NULL;
4314 /* pointers to the beginning and end+1 of input */
4315 const Py_UNICODE *startp = p;
4316 const Py_UNICODE *endp = p + size;
4317 /* pointer into the output */
4318 Py_UNICODE *str;
4319 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 char *reason = "character maps to <undefined>";
4322 PyObject *errorHandler = NULL;
4323 PyObject *exc = NULL;
4324 /* the following variable is used for caching string comparisons
4325 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4326 * 3=ignore, 4=xmlcharrefreplace */
4327 int known_errorHandler = -1;
4328
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 if (mapping == NULL) {
4330 PyErr_BadArgument();
4331 return NULL;
4332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333
4334 /* allocate enough for a simple 1:1 translation without
4335 replacements, if we need more, we'll resize */
4336 res = PyUnicode_FromUnicode(NULL, size);
4337 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004338 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 return res;
4341 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 while (p<endp) {
4344 /* try to encode it */
4345 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004346 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 goto onError;
4349 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004350 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 if (x!=Py_None) /* it worked => adjust input pointer */
4352 ++p;
4353 else { /* untranslatable character */
4354 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355 Py_ssize_t repsize;
4356 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357 Py_UNICODE *uni2;
4358 /* startpos for collecting untranslatable chars */
4359 const Py_UNICODE *collstart = p;
4360 const Py_UNICODE *collend = p+1;
4361 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 /* find all untranslatable characters */
4364 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004365 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 goto onError;
4367 Py_XDECREF(x);
4368 if (x!=Py_None)
4369 break;
4370 ++collend;
4371 }
4372 /* cache callback name lookup
4373 * (if not done yet, i.e. it's the first error) */
4374 if (known_errorHandler==-1) {
4375 if ((errors==NULL) || (!strcmp(errors, "strict")))
4376 known_errorHandler = 1;
4377 else if (!strcmp(errors, "replace"))
4378 known_errorHandler = 2;
4379 else if (!strcmp(errors, "ignore"))
4380 known_errorHandler = 3;
4381 else if (!strcmp(errors, "xmlcharrefreplace"))
4382 known_errorHandler = 4;
4383 else
4384 known_errorHandler = 0;
4385 }
4386 switch (known_errorHandler) {
4387 case 1: /* strict */
4388 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4389 goto onError;
4390 case 2: /* replace */
4391 /* No need to check for space, this is a 1:1 replacement */
4392 for (coll = collstart; coll<collend; ++coll)
4393 *str++ = '?';
4394 /* fall through */
4395 case 3: /* ignore */
4396 p = collend;
4397 break;
4398 case 4: /* xmlcharrefreplace */
4399 /* generate replacement (temporarily (mis)uses p) */
4400 for (p = collstart; p < collend; ++p) {
4401 char buffer[2+29+1+1];
4402 char *cp;
4403 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004404 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4406 goto onError;
4407 for (cp = buffer; *cp; ++cp)
4408 *str++ = *cp;
4409 }
4410 p = collend;
4411 break;
4412 default:
4413 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4414 reason, startp, size, &exc,
4415 collstart-startp, collend-startp, &newpos);
4416 if (repunicode == NULL)
4417 goto onError;
4418 /* generate replacement */
4419 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004420 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4422 Py_DECREF(repunicode);
4423 goto onError;
4424 }
4425 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4426 *str++ = *uni2;
4427 p = startp + newpos;
4428 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 }
4430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 /* Resize if we allocated to much */
4433 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004434 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004435 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 }
4438 Py_XDECREF(exc);
4439 Py_XDECREF(errorHandler);
4440 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 onError:
4443 Py_XDECREF(res);
4444 Py_XDECREF(exc);
4445 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 return NULL;
4447}
4448
4449PyObject *PyUnicode_Translate(PyObject *str,
4450 PyObject *mapping,
4451 const char *errors)
4452{
4453 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 str = PyUnicode_FromObject(str);
4456 if (str == NULL)
4457 goto onError;
4458 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4459 PyUnicode_GET_SIZE(str),
4460 mapping,
4461 errors);
4462 Py_DECREF(str);
4463 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 onError:
4466 Py_XDECREF(str);
4467 return NULL;
4468}
Tim Petersced69f82003-09-16 20:30:58 +00004469
Guido van Rossum9e896b32000-04-05 20:11:21 +00004470/* --- Decimal Encoder ---------------------------------------------------- */
4471
4472int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004473 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004474 char *output,
4475 const char *errors)
4476{
4477 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 PyObject *errorHandler = NULL;
4479 PyObject *exc = NULL;
4480 const char *encoding = "decimal";
4481 const char *reason = "invalid decimal Unicode string";
4482 /* the following variable is used for caching string comparisons
4483 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4484 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004485
4486 if (output == NULL) {
4487 PyErr_BadArgument();
4488 return -1;
4489 }
4490
4491 p = s;
4492 end = s + length;
4493 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004495 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 Py_ssize_t repsize;
4498 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 Py_UNICODE *uni2;
4500 Py_UNICODE *collstart;
4501 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004502
Guido van Rossum9e896b32000-04-05 20:11:21 +00004503 if (Py_UNICODE_ISSPACE(ch)) {
4504 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004506 continue;
4507 }
4508 decimal = Py_UNICODE_TODECIMAL(ch);
4509 if (decimal >= 0) {
4510 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004512 continue;
4513 }
Guido van Rossumba477042000-04-06 18:18:10 +00004514 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004515 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004517 continue;
4518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 /* All other characters are considered unencodable */
4520 collstart = p;
4521 collend = p+1;
4522 while (collend < end) {
4523 if ((0 < *collend && *collend < 256) ||
4524 !Py_UNICODE_ISSPACE(*collend) ||
4525 Py_UNICODE_TODECIMAL(*collend))
4526 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004527 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 /* cache callback name lookup
4529 * (if not done yet, i.e. it's the first error) */
4530 if (known_errorHandler==-1) {
4531 if ((errors==NULL) || (!strcmp(errors, "strict")))
4532 known_errorHandler = 1;
4533 else if (!strcmp(errors, "replace"))
4534 known_errorHandler = 2;
4535 else if (!strcmp(errors, "ignore"))
4536 known_errorHandler = 3;
4537 else if (!strcmp(errors, "xmlcharrefreplace"))
4538 known_errorHandler = 4;
4539 else
4540 known_errorHandler = 0;
4541 }
4542 switch (known_errorHandler) {
4543 case 1: /* strict */
4544 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4545 goto onError;
4546 case 2: /* replace */
4547 for (p = collstart; p < collend; ++p)
4548 *output++ = '?';
4549 /* fall through */
4550 case 3: /* ignore */
4551 p = collend;
4552 break;
4553 case 4: /* xmlcharrefreplace */
4554 /* generate replacement (temporarily (mis)uses p) */
4555 for (p = collstart; p < collend; ++p)
4556 output += sprintf(output, "&#%d;", (int)*p);
4557 p = collend;
4558 break;
4559 default:
4560 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4561 encoding, reason, s, length, &exc,
4562 collstart-s, collend-s, &newpos);
4563 if (repunicode == NULL)
4564 goto onError;
4565 /* generate replacement */
4566 repsize = PyUnicode_GET_SIZE(repunicode);
4567 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4568 Py_UNICODE ch = *uni2;
4569 if (Py_UNICODE_ISSPACE(ch))
4570 *output++ = ' ';
4571 else {
4572 decimal = Py_UNICODE_TODECIMAL(ch);
4573 if (decimal >= 0)
4574 *output++ = '0' + decimal;
4575 else if (0 < ch && ch < 256)
4576 *output++ = (char)ch;
4577 else {
4578 Py_DECREF(repunicode);
4579 raise_encode_exception(&exc, encoding,
4580 s, length, collstart-s, collend-s, reason);
4581 goto onError;
4582 }
4583 }
4584 }
4585 p = s + newpos;
4586 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004587 }
4588 }
4589 /* 0-terminate the output string */
4590 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 Py_XDECREF(exc);
4592 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004593 return 0;
4594
4595 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 Py_XDECREF(exc);
4597 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004598 return -1;
4599}
4600
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601/* --- Helpers ------------------------------------------------------------ */
4602
Thomas Wouters477c8d52006-05-27 19:21:47 +00004603#define STRINGLIB_CHAR Py_UNICODE
4604
4605#define STRINGLIB_LEN PyUnicode_GET_SIZE
4606#define STRINGLIB_NEW PyUnicode_FromUnicode
4607#define STRINGLIB_STR PyUnicode_AS_UNICODE
4608
4609Py_LOCAL_INLINE(int)
4610STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004612 if (str[0] != other[0])
4613 return 1;
4614 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615}
4616
Thomas Wouters477c8d52006-05-27 19:21:47 +00004617#define STRINGLIB_EMPTY unicode_empty
4618
4619#include "stringlib/fastsearch.h"
4620
4621#include "stringlib/count.h"
4622#include "stringlib/find.h"
4623#include "stringlib/partition.h"
4624
/* helper macro to fixup start/end slice values: operates on variables
   named `start` and `end` in the calling scope.  Negative values are
   taken relative to (obj)->length and then clamped to 0; `end` is first
   limited to (obj)->length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4637
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004639 PyObject *substr,
4640 Py_ssize_t start,
4641 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004644 PyUnicodeObject* str_obj;
4645 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004646
Thomas Wouters477c8d52006-05-27 19:21:47 +00004647 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4648 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004650 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4651 if (!sub_obj) {
4652 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 return -1;
4654 }
Tim Petersced69f82003-09-16 20:30:58 +00004655
Thomas Wouters477c8d52006-05-27 19:21:47 +00004656 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004657
Thomas Wouters477c8d52006-05-27 19:21:47 +00004658 result = stringlib_count(
4659 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4660 );
4661
4662 Py_DECREF(sub_obj);
4663 Py_DECREF(str_obj);
4664
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 return result;
4666}
4667
Martin v. Löwis18e16552006-02-15 17:27:45 +00004668Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004669 PyObject *sub,
4670 Py_ssize_t start,
4671 Py_ssize_t end,
4672 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004674 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004675
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004677 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004678 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004679 sub = PyUnicode_FromObject(sub);
4680 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004681 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004682 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Tim Petersced69f82003-09-16 20:30:58 +00004684
Thomas Wouters477c8d52006-05-27 19:21:47 +00004685 if (direction > 0)
4686 result = stringlib_find_slice(
4687 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4688 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4689 start, end
4690 );
4691 else
4692 result = stringlib_rfind_slice(
4693 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4694 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4695 start, end
4696 );
4697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004699 Py_DECREF(sub);
4700
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 return result;
4702}
4703
Tim Petersced69f82003-09-16 20:30:58 +00004704static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705int tailmatch(PyUnicodeObject *self,
4706 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004707 Py_ssize_t start,
4708 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 int direction)
4710{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 if (substring->length == 0)
4712 return 1;
4713
Thomas Wouters477c8d52006-05-27 19:21:47 +00004714 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
4716 end -= substring->length;
4717 if (end < start)
4718 return 0;
4719
4720 if (direction > 0) {
4721 if (Py_UNICODE_MATCH(self, end, substring))
4722 return 1;
4723 } else {
4724 if (Py_UNICODE_MATCH(self, start, substring))
4725 return 1;
4726 }
4727
4728 return 0;
4729}
4730
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004733 Py_ssize_t start,
4734 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 int direction)
4736{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004737 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004738
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 str = PyUnicode_FromObject(str);
4740 if (str == NULL)
4741 return -1;
4742 substr = PyUnicode_FromObject(substr);
4743 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004744 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 return -1;
4746 }
Tim Petersced69f82003-09-16 20:30:58 +00004747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 result = tailmatch((PyUnicodeObject *)str,
4749 (PyUnicodeObject *)substr,
4750 start, end, direction);
4751 Py_DECREF(str);
4752 Py_DECREF(substr);
4753 return result;
4754}
4755
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756/* Apply fixfct filter to the Unicode object self and return a
4757 reference to the modified object */
4758
Tim Petersced69f82003-09-16 20:30:58 +00004759static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760PyObject *fixup(PyUnicodeObject *self,
4761 int (*fixfct)(PyUnicodeObject *s))
4762{
4763
4764 PyUnicodeObject *u;
4765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004766 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 if (u == NULL)
4768 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004769
4770 Py_UNICODE_COPY(u->str, self->str, self->length);
4771
Tim Peters7a29bd52001-09-12 03:03:31 +00004772 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 /* fixfct should return TRUE if it modified the buffer. If
4774 FALSE, return a reference to the original buffer instead
4775 (to save space, not time) */
4776 Py_INCREF(self);
4777 Py_DECREF(u);
4778 return (PyObject*) self;
4779 }
4780 return (PyObject*) u;
4781}
4782
Tim Petersced69f82003-09-16 20:30:58 +00004783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784int fixupper(PyUnicodeObject *self)
4785{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 Py_UNICODE *s = self->str;
4788 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 while (len-- > 0) {
4791 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004792
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 ch = Py_UNICODE_TOUPPER(*s);
4794 if (ch != *s) {
4795 status = 1;
4796 *s = ch;
4797 }
4798 s++;
4799 }
4800
4801 return status;
4802}
4803
Tim Petersced69f82003-09-16 20:30:58 +00004804static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805int fixlower(PyUnicodeObject *self)
4806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004807 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 Py_UNICODE *s = self->str;
4809 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 while (len-- > 0) {
4812 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004813
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 ch = Py_UNICODE_TOLOWER(*s);
4815 if (ch != *s) {
4816 status = 1;
4817 *s = ch;
4818 }
4819 s++;
4820 }
4821
4822 return status;
4823}
4824
Tim Petersced69f82003-09-16 20:30:58 +00004825static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826int fixswapcase(PyUnicodeObject *self)
4827{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004828 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 Py_UNICODE *s = self->str;
4830 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 while (len-- > 0) {
4833 if (Py_UNICODE_ISUPPER(*s)) {
4834 *s = Py_UNICODE_TOLOWER(*s);
4835 status = 1;
4836 } else if (Py_UNICODE_ISLOWER(*s)) {
4837 *s = Py_UNICODE_TOUPPER(*s);
4838 status = 1;
4839 }
4840 s++;
4841 }
4842
4843 return status;
4844}
4845
Tim Petersced69f82003-09-16 20:30:58 +00004846static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847int fixcapitalize(PyUnicodeObject *self)
4848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004849 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004850 Py_UNICODE *s = self->str;
4851 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004852
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004853 if (len == 0)
4854 return 0;
4855 if (Py_UNICODE_ISLOWER(*s)) {
4856 *s = Py_UNICODE_TOUPPER(*s);
4857 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004859 s++;
4860 while (--len > 0) {
4861 if (Py_UNICODE_ISUPPER(*s)) {
4862 *s = Py_UNICODE_TOLOWER(*s);
4863 status = 1;
4864 }
4865 s++;
4866 }
4867 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868}
4869
4870static
4871int fixtitle(PyUnicodeObject *self)
4872{
4873 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4874 register Py_UNICODE *e;
4875 int previous_is_cased;
4876
4877 /* Shortcut for single character strings */
4878 if (PyUnicode_GET_SIZE(self) == 1) {
4879 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4880 if (*p != ch) {
4881 *p = ch;
4882 return 1;
4883 }
4884 else
4885 return 0;
4886 }
Tim Petersced69f82003-09-16 20:30:58 +00004887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 e = p + PyUnicode_GET_SIZE(self);
4889 previous_is_cased = 0;
4890 for (; p < e; p++) {
4891 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004892
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 if (previous_is_cased)
4894 *p = Py_UNICODE_TOLOWER(ch);
4895 else
4896 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004897
4898 if (Py_UNICODE_ISLOWER(ch) ||
4899 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 Py_UNICODE_ISTITLE(ch))
4901 previous_is_cased = 1;
4902 else
4903 previous_is_cased = 0;
4904 }
4905 return 1;
4906}
4907
Tim Peters8ce9f162004-08-27 01:49:32 +00004908PyObject *
4909PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910{
Tim Peters8ce9f162004-08-27 01:49:32 +00004911 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004912 const Py_UNICODE blank = ' ';
4913 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004914 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004915 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004916 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4917 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004918 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4919 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004920 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004921 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004922 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923
Tim Peters05eba1f2004-08-27 21:32:02 +00004924 fseq = PySequence_Fast(seq, "");
4925 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004926 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004927 }
4928
Tim Peters91879ab2004-08-27 22:35:44 +00004929 /* Grrrr. A codec may be invoked to convert str objects to
4930 * Unicode, and so it's possible to call back into Python code
4931 * during PyUnicode_FromObject(), and so it's possible for a sick
4932 * codec to change the size of fseq (if seq is a list). Therefore
4933 * we have to keep refetching the size -- can't assume seqlen
4934 * is invariant.
4935 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004936 seqlen = PySequence_Fast_GET_SIZE(fseq);
4937 /* If empty sequence, return u"". */
4938 if (seqlen == 0) {
4939 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4940 goto Done;
4941 }
4942 /* If singleton sequence with an exact Unicode, return that. */
4943 if (seqlen == 1) {
4944 item = PySequence_Fast_GET_ITEM(fseq, 0);
4945 if (PyUnicode_CheckExact(item)) {
4946 Py_INCREF(item);
4947 res = (PyUnicodeObject *)item;
4948 goto Done;
4949 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004950 }
4951
Tim Peters05eba1f2004-08-27 21:32:02 +00004952 /* At least two items to join, or one that isn't exact Unicode. */
4953 if (seqlen > 1) {
4954 /* Set up sep and seplen -- they're needed. */
4955 if (separator == NULL) {
4956 sep = &blank;
4957 seplen = 1;
4958 }
4959 else {
4960 internal_separator = PyUnicode_FromObject(separator);
4961 if (internal_separator == NULL)
4962 goto onError;
4963 sep = PyUnicode_AS_UNICODE(internal_separator);
4964 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004965 /* In case PyUnicode_FromObject() mutated seq. */
4966 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004967 }
4968 }
4969
4970 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004971 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004972 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004973 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004974 res_p = PyUnicode_AS_UNICODE(res);
4975 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004976
Tim Peters05eba1f2004-08-27 21:32:02 +00004977 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004978 Py_ssize_t itemlen;
4979 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004980
4981 item = PySequence_Fast_GET_ITEM(fseq, i);
4982 /* Convert item to Unicode. */
4983 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4984 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004985 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004986 " %.80s found",
4987 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004988 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004989 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004990 item = PyUnicode_FromObject(item);
4991 if (item == NULL)
4992 goto onError;
4993 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004994
Tim Peters91879ab2004-08-27 22:35:44 +00004995 /* In case PyUnicode_FromObject() mutated seq. */
4996 seqlen = PySequence_Fast_GET_SIZE(fseq);
4997
Tim Peters8ce9f162004-08-27 01:49:32 +00004998 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005000 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005001 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005002 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005003 if (i < seqlen - 1) {
5004 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005005 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005006 goto Overflow;
5007 }
5008 if (new_res_used > res_alloc) {
5009 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005010 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005011 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005012 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005013 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005014 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005015 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005016 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005018 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005019 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005021
5022 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005023 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005024 res_p += itemlen;
5025 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005026 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005027 res_p += seplen;
5028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005030 res_used = new_res_used;
5031 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005032
Tim Peters05eba1f2004-08-27 21:32:02 +00005033 /* Shrink res to match the used area; this probably can't fail,
5034 * but it's cheap to check.
5035 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005036 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005037 goto onError;
5038
5039 Done:
5040 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005041 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 return (PyObject *)res;
5043
Tim Peters8ce9f162004-08-27 01:49:32 +00005044 Overflow:
5045 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005046 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005047 Py_DECREF(item);
5048 /* fall through */
5049
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005051 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005052 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005053 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 return NULL;
5055}
5056
Tim Petersced69f82003-09-16 20:30:58 +00005057static
5058PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005059 Py_ssize_t left,
5060 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 Py_UNICODE fill)
5062{
5063 PyUnicodeObject *u;
5064
5065 if (left < 0)
5066 left = 0;
5067 if (right < 0)
5068 right = 0;
5069
Tim Peters7a29bd52001-09-12 03:03:31 +00005070 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 Py_INCREF(self);
5072 return self;
5073 }
5074
5075 u = _PyUnicode_New(left + self->length + right);
5076 if (u) {
5077 if (left)
5078 Py_UNICODE_FILL(u->str, fill, left);
5079 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5080 if (right)
5081 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5082 }
5083
5084 return u;
5085}
5086
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, to which control
   jumps if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so the expansion is a single statement and stays
   correct under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5097
5098static
5099PyObject *split_whitespace(PyUnicodeObject *self,
5100 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005101 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005103 register Py_ssize_t i;
5104 register Py_ssize_t j;
5105 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 PyObject *str;
5107
5108 for (i = j = 0; i < len; ) {
5109 /* find a token */
5110 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5111 i++;
5112 j = i;
5113 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5114 i++;
5115 if (j < i) {
5116 if (maxcount-- <= 0)
5117 break;
5118 SPLIT_APPEND(self->str, j, i);
5119 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5120 i++;
5121 j = i;
5122 }
5123 }
5124 if (j < len) {
5125 SPLIT_APPEND(self->str, j, len);
5126 }
5127 return list;
5128
5129 onError:
5130 Py_DECREF(list);
5131 return NULL;
5132}
5133
5134PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005135 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137 register Py_ssize_t i;
5138 register Py_ssize_t j;
5139 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 PyObject *list;
5141 PyObject *str;
5142 Py_UNICODE *data;
5143
5144 string = PyUnicode_FromObject(string);
5145 if (string == NULL)
5146 return NULL;
5147 data = PyUnicode_AS_UNICODE(string);
5148 len = PyUnicode_GET_SIZE(string);
5149
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 list = PyList_New(0);
5151 if (!list)
5152 goto onError;
5153
5154 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005155 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005158 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160
5161 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005162 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 if (i < len) {
5164 if (data[i] == '\r' && i + 1 < len &&
5165 data[i+1] == '\n')
5166 i += 2;
5167 else
5168 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005169 if (keepends)
5170 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 }
Guido van Rossum86662912000-04-11 15:38:46 +00005172 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 j = i;
5174 }
5175 if (j < len) {
5176 SPLIT_APPEND(data, j, len);
5177 }
5178
5179 Py_DECREF(string);
5180 return list;
5181
5182 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005183 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_DECREF(string);
5185 return NULL;
5186}
5187
Tim Petersced69f82003-09-16 20:30:58 +00005188static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189PyObject *split_char(PyUnicodeObject *self,
5190 PyObject *list,
5191 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005192 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005194 register Py_ssize_t i;
5195 register Py_ssize_t j;
5196 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 PyObject *str;
5198
5199 for (i = j = 0; i < len; ) {
5200 if (self->str[i] == ch) {
5201 if (maxcount-- <= 0)
5202 break;
5203 SPLIT_APPEND(self->str, j, i);
5204 i = j = i + 1;
5205 } else
5206 i++;
5207 }
5208 if (j <= len) {
5209 SPLIT_APPEND(self->str, j, len);
5210 }
5211 return list;
5212
5213 onError:
5214 Py_DECREF(list);
5215 return NULL;
5216}
5217
Tim Petersced69f82003-09-16 20:30:58 +00005218static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219PyObject *split_substring(PyUnicodeObject *self,
5220 PyObject *list,
5221 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005224 register Py_ssize_t i;
5225 register Py_ssize_t j;
5226 Py_ssize_t len = self->length;
5227 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 PyObject *str;
5229
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005230 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 if (Py_UNICODE_MATCH(self, i, substring)) {
5232 if (maxcount-- <= 0)
5233 break;
5234 SPLIT_APPEND(self->str, j, i);
5235 i = j = i + sublen;
5236 } else
5237 i++;
5238 }
5239 if (j <= len) {
5240 SPLIT_APPEND(self->str, j, len);
5241 }
5242 return list;
5243
5244 onError:
5245 Py_DECREF(list);
5246 return NULL;
5247}
5248
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005249static
5250PyObject *rsplit_whitespace(PyUnicodeObject *self,
5251 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005252 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005253{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005254 register Py_ssize_t i;
5255 register Py_ssize_t j;
5256 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005257 PyObject *str;
5258
5259 for (i = j = len - 1; i >= 0; ) {
5260 /* find a token */
5261 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5262 i--;
5263 j = i;
5264 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5265 i--;
5266 if (j > i) {
5267 if (maxcount-- <= 0)
5268 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005269 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005270 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5271 i--;
5272 j = i;
5273 }
5274 }
5275 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005276 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005277 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005278 if (PyList_Reverse(list) < 0)
5279 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005280 return list;
5281
5282 onError:
5283 Py_DECREF(list);
5284 return NULL;
5285}
5286
5287static
5288PyObject *rsplit_char(PyUnicodeObject *self,
5289 PyObject *list,
5290 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005292{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005293 register Py_ssize_t i;
5294 register Py_ssize_t j;
5295 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005296 PyObject *str;
5297
5298 for (i = j = len - 1; i >= 0; ) {
5299 if (self->str[i] == ch) {
5300 if (maxcount-- <= 0)
5301 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005302 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005303 j = i = i - 1;
5304 } else
5305 i--;
5306 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005307 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005308 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005309 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005310 if (PyList_Reverse(list) < 0)
5311 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005312 return list;
5313
5314 onError:
5315 Py_DECREF(list);
5316 return NULL;
5317}
5318
5319static
5320PyObject *rsplit_substring(PyUnicodeObject *self,
5321 PyObject *list,
5322 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005323 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005325 register Py_ssize_t i;
5326 register Py_ssize_t j;
5327 Py_ssize_t len = self->length;
5328 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005329 PyObject *str;
5330
5331 for (i = len - sublen, j = len; i >= 0; ) {
5332 if (Py_UNICODE_MATCH(self, i, substring)) {
5333 if (maxcount-- <= 0)
5334 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005335 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005336 j = i;
5337 i -= sublen;
5338 } else
5339 i--;
5340 }
5341 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005342 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005343 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005344 if (PyList_Reverse(list) < 0)
5345 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005346 return list;
5347
5348 onError:
5349 Py_DECREF(list);
5350 return NULL;
5351}
5352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353#undef SPLIT_APPEND
5354
5355static
5356PyObject *split(PyUnicodeObject *self,
5357 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359{
5360 PyObject *list;
5361
5362 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005363 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
5365 list = PyList_New(0);
5366 if (!list)
5367 return NULL;
5368
5369 if (substring == NULL)
5370 return split_whitespace(self,list,maxcount);
5371
5372 else if (substring->length == 1)
5373 return split_char(self,list,substring->str[0],maxcount);
5374
5375 else if (substring->length == 0) {
5376 Py_DECREF(list);
5377 PyErr_SetString(PyExc_ValueError, "empty separator");
5378 return NULL;
5379 }
5380 else
5381 return split_substring(self,list,substring,maxcount);
5382}
5383
Tim Petersced69f82003-09-16 20:30:58 +00005384static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005385PyObject *rsplit(PyUnicodeObject *self,
5386 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005387 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005388{
5389 PyObject *list;
5390
5391 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005392 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005393
5394 list = PyList_New(0);
5395 if (!list)
5396 return NULL;
5397
5398 if (substring == NULL)
5399 return rsplit_whitespace(self,list,maxcount);
5400
5401 else if (substring->length == 1)
5402 return rsplit_char(self,list,substring->str[0],maxcount);
5403
5404 else if (substring->length == 0) {
5405 Py_DECREF(list);
5406 PyErr_SetString(PyExc_ValueError, "empty separator");
5407 return NULL;
5408 }
5409 else
5410 return rsplit_substring(self,list,substring,maxcount);
5411}
5412
5413static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414PyObject *replace(PyUnicodeObject *self,
5415 PyUnicodeObject *str1,
5416 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005417 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
5419 PyUnicodeObject *u;
5420
5421 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005422 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Thomas Wouters477c8d52006-05-27 19:21:47 +00005424 if (str1->length == str2->length) {
5425 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005426 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005427 if (str1->length == 1) {
5428 /* replace characters */
5429 Py_UNICODE u1, u2;
5430 if (!findchar(self->str, self->length, str1->str[0]))
5431 goto nothing;
5432 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5433 if (!u)
5434 return NULL;
5435 Py_UNICODE_COPY(u->str, self->str, self->length);
5436 u1 = str1->str[0];
5437 u2 = str2->str[0];
5438 for (i = 0; i < u->length; i++)
5439 if (u->str[i] == u1) {
5440 if (--maxcount < 0)
5441 break;
5442 u->str[i] = u2;
5443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005445 i = fastsearch(
5446 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005448 if (i < 0)
5449 goto nothing;
5450 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5451 if (!u)
5452 return NULL;
5453 Py_UNICODE_COPY(u->str, self->str, self->length);
5454 while (i <= self->length - str1->length)
5455 if (Py_UNICODE_MATCH(self, i, str1)) {
5456 if (--maxcount < 0)
5457 break;
5458 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5459 i += str1->length;
5460 } else
5461 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005464
5465 Py_ssize_t n, i, j, e;
5466 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 Py_UNICODE *p;
5468
5469 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005470 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 if (n > maxcount)
5472 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005473 if (n == 0)
5474 goto nothing;
5475 /* new_size = self->length + n * (str2->length - str1->length)); */
5476 delta = (str2->length - str1->length);
5477 if (delta == 0) {
5478 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005480 product = n * (str2->length - str1->length);
5481 if ((product / (str2->length - str1->length)) != n) {
5482 PyErr_SetString(PyExc_OverflowError,
5483 "replace string is too long");
5484 return NULL;
5485 }
5486 new_size = self->length + product;
5487 if (new_size < 0) {
5488 PyErr_SetString(PyExc_OverflowError,
5489 "replace string is too long");
5490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 }
5492 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005493 u = _PyUnicode_New(new_size);
5494 if (!u)
5495 return NULL;
5496 i = 0;
5497 p = u->str;
5498 e = self->length - str1->length;
5499 if (str1->length > 0) {
5500 while (n-- > 0) {
5501 /* look for next match */
5502 j = i;
5503 while (j <= e) {
5504 if (Py_UNICODE_MATCH(self, j, str1))
5505 break;
5506 j++;
5507 }
5508 if (j > i) {
5509 if (j > e)
5510 break;
5511 /* copy unchanged part [i:j] */
5512 Py_UNICODE_COPY(p, self->str+i, j-i);
5513 p += j - i;
5514 }
5515 /* copy substitution string */
5516 if (str2->length > 0) {
5517 Py_UNICODE_COPY(p, str2->str, str2->length);
5518 p += str2->length;
5519 }
5520 i = j + str1->length;
5521 }
5522 if (i < self->length)
5523 /* copy tail [i:] */
5524 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5525 } else {
5526 /* interleave */
5527 while (n > 0) {
5528 Py_UNICODE_COPY(p, str2->str, str2->length);
5529 p += str2->length;
5530 if (--n <= 0)
5531 break;
5532 *p++ = self->str[i++];
5533 }
5534 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005538
5539nothing:
5540 /* nothing to replace; return original string (when possible) */
5541 if (PyUnicode_CheckExact(self)) {
5542 Py_INCREF(self);
5543 return (PyObject *) self;
5544 }
5545 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
5548/* --- Unicode Object Methods --------------------------------------------- */
5549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005550PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551"S.title() -> unicode\n\
5552\n\
5553Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005554characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
5556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005557unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 return fixup(self, fixtitle);
5560}
5561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005562PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563"S.capitalize() -> unicode\n\
5564\n\
5565Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005566have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
5568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005569unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 return fixup(self, fixcapitalize);
5572}
5573
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   capitalize each word in place inside the list, then join the list.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word by its capitalized form */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old word before its slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5611
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005612/* Argument converter. Coerces to a single unicode character */
5613
5614static int
5615convert_uc(PyObject *obj, void *addr)
5616{
5617 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5618 PyObject *uniobj;
5619 Py_UNICODE *unistr;
5620
5621 uniobj = PyUnicode_FromObject(obj);
5622 if (uniobj == NULL) {
5623 PyErr_SetString(PyExc_TypeError,
5624 "The fill character cannot be converted to Unicode");
5625 return 0;
5626 }
5627 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5628 PyErr_SetString(PyExc_TypeError,
5629 "The fill character must be exactly one character long");
5630 Py_DECREF(uniobj);
5631 return 0;
5632 }
5633 unistr = PyUnicode_AS_UNICODE(uniobj);
5634 *fillcharloc = unistr[0];
5635 Py_DECREF(uniobj);
5636 return 1;
5637}
5638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005640"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005642Return S centered in a Unicode string of length width. Padding is\n\
5643done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
5645static PyObject *
5646unicode_center(PyUnicodeObject *self, PyObject *args)
5647{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 Py_ssize_t marg, left;
5649 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005650 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Thomas Woutersde017742006-02-16 19:34:37 +00005652 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 return NULL;
5654
Tim Peters7a29bd52001-09-12 03:03:31 +00005655 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 Py_INCREF(self);
5657 return (PyObject*) self;
5658 }
5659
5660 marg = width - self->length;
5661 left = marg / 2 + (marg & width & 1);
5662
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005663 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664}
5665
Marc-André Lemburge5034372000-08-08 08:04:29 +00005666#if 0
5667
5668/* This code should go into some future Unicode collation support
5669 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005670 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005671
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005672/* speedy UTF-16 code point order comparison */
5673/* gleaned from: */
5674/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5675
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005676static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005677{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005678 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005679 0, 0, 0, 0, 0, 0, 0, 0,
5680 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005681 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005682};
5683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684static int
5685unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 Py_UNICODE *s1 = str1->str;
5690 Py_UNICODE *s2 = str2->str;
5691
5692 len1 = str1->length;
5693 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005694
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005696 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005697
5698 c1 = *s1++;
5699 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005700
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005701 if (c1 > (1<<11) * 26)
5702 c1 += utf16Fixup[c1>>11];
5703 if (c2 > (1<<11) * 26)
5704 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005705 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005706
5707 if (c1 != c2)
5708 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005709
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005710 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 }
5712
5713 return (len1 < len2) ? -1 : (len1 != len2);
5714}
5715
Marc-André Lemburge5034372000-08-08 08:04:29 +00005716#else
5717
5718static int
5719unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005722
5723 Py_UNICODE *s1 = str1->str;
5724 Py_UNICODE *s2 = str2->str;
5725
5726 len1 = str1->length;
5727 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005728
Marc-André Lemburge5034372000-08-08 08:04:29 +00005729 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005730 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005731
Fredrik Lundh45714e92001-06-26 16:39:36 +00005732 c1 = *s1++;
5733 c2 = *s2++;
5734
5735 if (c1 != c2)
5736 return (c1 < c2) ? -1 : 1;
5737
Marc-André Lemburge5034372000-08-08 08:04:29 +00005738 len1--; len2--;
5739 }
5740
5741 return (len1 < len2) ? -1 : (len1 != len2);
5742}
5743
5744#endif
5745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746int PyUnicode_Compare(PyObject *left,
5747 PyObject *right)
5748{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005749 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5750 return unicode_compare((PyUnicodeObject *)left,
5751 (PyUnicodeObject *)right);
5752 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5753 (PyUnicode_Check(left) && PyString_Check(right))) {
5754 if (PyUnicode_Check(left))
5755 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5756 if (PyUnicode_Check(right))
5757 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5758 assert(PyString_Check(left));
5759 assert(PyString_Check(right));
5760 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005762 PyErr_Format(PyExc_TypeError,
5763 "Can't compare %.100s and %.100s",
5764 left->ob_type->tp_name,
5765 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return -1;
5767}
5768
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005769PyObject *PyUnicode_RichCompare(PyObject *left,
5770 PyObject *right,
5771 int op)
5772{
5773 int result;
5774
5775 result = PyUnicode_Compare(left, right);
5776 if (result == -1 && PyErr_Occurred())
5777 goto onError;
5778
5779 /* Convert the return value to a Boolean */
5780 switch (op) {
5781 case Py_EQ:
5782 result = (result == 0);
5783 break;
5784 case Py_NE:
5785 result = (result != 0);
5786 break;
5787 case Py_LE:
5788 result = (result <= 0);
5789 break;
5790 case Py_GE:
5791 result = (result >= 0);
5792 break;
5793 case Py_LT:
5794 result = (result == -1);
5795 break;
5796 case Py_GT:
5797 result = (result == 1);
5798 break;
5799 }
5800 return PyBool_FromLong(result);
5801
5802 onError:
5803
5804 /* Standard case
5805
5806 Type errors mean that PyUnicode_FromObject() could not convert
5807 one of the arguments (usually the right hand side) to Unicode,
5808 ie. we can't handle the comparison request. However, it is
5809 possible that the other object knows a comparison method, which
5810 is why we return Py_NotImplemented to give the other object a
5811 chance.
5812
5813 */
5814 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5815 PyErr_Clear();
5816 Py_INCREF(Py_NotImplemented);
5817 return Py_NotImplemented;
5818 }
5819 if (op != Py_EQ && op != Py_NE)
5820 return NULL;
5821
5822 /* Equality comparison.
5823
5824 This is a special case: we silence any PyExc_UnicodeDecodeError
5825 and instead turn it into a PyErr_UnicodeWarning.
5826
5827 */
5828 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5829 return NULL;
5830 PyErr_Clear();
5831 if (PyErr_Warn(PyExc_UnicodeWarning,
5832 (op == Py_EQ) ?
5833 "Unicode equal comparison "
5834 "failed to convert both arguments to Unicode - "
5835 "interpreting them as being unequal" :
5836 "Unicode unequal comparison "
5837 "failed to convert both arguments to Unicode - "
5838 "interpreting them as being unequal"
5839 ) < 0)
5840 return NULL;
5841 result = (op == Py_NE);
5842 return PyBool_FromLong(result);
5843}
5844
Guido van Rossum403d68b2000-03-13 15:55:09 +00005845int PyUnicode_Contains(PyObject *container,
5846 PyObject *element)
5847{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005848 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005850
5851 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005852 sub = PyUnicode_FromObject(element);
5853 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005854 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005855 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005856 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005857 }
5858
Thomas Wouters477c8d52006-05-27 19:21:47 +00005859 str = PyUnicode_FromObject(container);
5860 if (!str) {
5861 Py_DECREF(sub);
5862 return -1;
5863 }
5864
5865 result = stringlib_contains_obj(str, sub);
5866
5867 Py_DECREF(str);
5868 Py_DECREF(sub);
5869
Guido van Rossum403d68b2000-03-13 15:55:09 +00005870 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005871}
5872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873/* Concat to string or Unicode object giving a new Unicode object. */
5874
5875PyObject *PyUnicode_Concat(PyObject *left,
5876 PyObject *right)
5877{
5878 PyUnicodeObject *u = NULL, *v = NULL, *w;
5879
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005880 if (PyBytes_Check(left) || PyBytes_Check(right))
5881 return PyBytes_Concat(left, right);
5882
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 /* Coerce the two arguments */
5884 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5885 if (u == NULL)
5886 goto onError;
5887 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5888 if (v == NULL)
5889 goto onError;
5890
5891 /* Shortcuts */
5892 if (v == unicode_empty) {
5893 Py_DECREF(v);
5894 return (PyObject *)u;
5895 }
5896 if (u == unicode_empty) {
5897 Py_DECREF(u);
5898 return (PyObject *)v;
5899 }
5900
5901 /* Concat the two Unicode strings */
5902 w = _PyUnicode_New(u->length + v->length);
5903 if (w == NULL)
5904 goto onError;
5905 Py_UNICODE_COPY(w->str, u->str, u->length);
5906 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5907
5908 Py_DECREF(u);
5909 Py_DECREF(v);
5910 return (PyObject *)w;
5911
5912onError:
5913 Py_XDECREF(u);
5914 Py_XDECREF(v);
5915 return NULL;
5916}
5917
Walter Dörwald1ab83302007-05-18 17:15:44 +00005918void
5919PyUnicode_Append(PyObject **pleft, PyObject *right)
5920{
5921 PyObject *new;
5922 if (*pleft == NULL)
5923 return;
5924 if (right == NULL || !PyUnicode_Check(*pleft)) {
5925 Py_DECREF(*pleft);
5926 *pleft = NULL;
5927 return;
5928 }
5929 new = PyUnicode_Concat(*pleft, right);
5930 Py_DECREF(*pleft);
5931 *pleft = new;
5932}
5933
5934void
5935PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
5936{
5937 PyUnicode_Append(pleft, right);
5938 Py_XDECREF(right);
5939}
5940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005941PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942"S.count(sub[, start[, end]]) -> int\n\
5943\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944Return the number of non-overlapping occurrences of substring sub in\n\
5945Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005946interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
5948static PyObject *
5949unicode_count(PyUnicodeObject *self, PyObject *args)
5950{
5951 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005952 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005953 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 PyObject *result;
5955
Guido van Rossumb8872e62000-05-09 14:14:27 +00005956 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5957 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 return NULL;
5959
5960 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 if (substring == NULL)
5963 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005964
Thomas Wouters477c8d52006-05-27 19:21:47 +00005965 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Thomas Wouters477c8d52006-05-27 19:21:47 +00005967 result = PyInt_FromSsize_t(
5968 stringlib_count(self->str + start, end - start,
5969 substring->str, substring->length)
5970 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
5972 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 return result;
5975}
5976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005977PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005978"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005980Encodes S using the codec registered for encoding. encoding defaults\n\
5981to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005982handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5984'xmlcharrefreplace' as well as any other name registered with\n\
5985codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
5987static PyObject *
5988unicode_encode(PyUnicodeObject *self, PyObject *args)
5989{
5990 char *encoding = NULL;
5991 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005992 PyObject *v;
5993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5995 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005996 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005997 if (v == NULL)
5998 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005999 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006000 if (PyString_Check(v)) {
6001 /* Old codec, turn it into bytes */
6002 PyObject *b = PyBytes_FromObject(v);
6003 Py_DECREF(v);
6004 return b;
6005 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006006 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006007 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006008 "(type=%.400s)",
6009 v->ob_type->tp_name);
6010 Py_DECREF(v);
6011 return NULL;
6012 }
6013 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006014
6015 onError:
6016 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006017}
6018
6019PyDoc_STRVAR(decode__doc__,
6020"S.decode([encoding[,errors]]) -> string or unicode\n\
6021\n\
6022Decodes S using the codec registered for encoding. encoding defaults\n\
6023to the default encoding. errors may be given to set a different error\n\
6024handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6025a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6026as well as any other name registerd with codecs.register_error that is\n\
6027able to handle UnicodeDecodeErrors.");
6028
6029static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006030unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006031{
6032 char *encoding = NULL;
6033 char *errors = NULL;
6034 PyObject *v;
6035
6036 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6037 return NULL;
6038 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006039 if (v == NULL)
6040 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006041 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6042 PyErr_Format(PyExc_TypeError,
6043 "decoder did not return a string/unicode object "
6044 "(type=%.400s)",
6045 v->ob_type->tp_name);
6046 Py_DECREF(v);
6047 return NULL;
6048 }
6049 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006050
6051 onError:
6052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053}
6054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056"S.expandtabs([tabsize]) -> unicode\n\
6057\n\
6058Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
6061static PyObject*
6062unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6063{
6064 Py_UNICODE *e;
6065 Py_UNICODE *p;
6066 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006067 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 PyUnicodeObject *u;
6069 int tabsize = 8;
6070
6071 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6072 return NULL;
6073
Thomas Wouters7e474022000-07-16 12:04:32 +00006074 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 i = j = 0;
6076 e = self->str + self->length;
6077 for (p = self->str; p < e; p++)
6078 if (*p == '\t') {
6079 if (tabsize > 0)
6080 j += tabsize - (j % tabsize);
6081 }
6082 else {
6083 j++;
6084 if (*p == '\n' || *p == '\r') {
6085 i += j;
6086 j = 0;
6087 }
6088 }
6089
6090 /* Second pass: create output string and fill it */
6091 u = _PyUnicode_New(i + j);
6092 if (!u)
6093 return NULL;
6094
6095 j = 0;
6096 q = u->str;
6097
6098 for (p = self->str; p < e; p++)
6099 if (*p == '\t') {
6100 if (tabsize > 0) {
6101 i = tabsize - (j % tabsize);
6102 j += i;
6103 while (i--)
6104 *q++ = ' ';
6105 }
6106 }
6107 else {
6108 j++;
6109 *q++ = *p;
6110 if (*p == '\n' || *p == '\r')
6111 j = 0;
6112 }
6113
6114 return (PyObject*) u;
6115}
6116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006117PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118"S.find(sub [,start [,end]]) -> int\n\
6119\n\
6120Return the lowest index in S where substring sub is found,\n\
6121such that sub is contained within s[start,end]. Optional\n\
6122arguments start and end are interpreted as in slice notation.\n\
6123\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
6126static PyObject *
6127unicode_find(PyUnicodeObject *self, PyObject *args)
6128{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006129 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006130 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006131 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006132 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Guido van Rossumb8872e62000-05-09 14:14:27 +00006134 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6135 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006137 substring = PyUnicode_FromObject(substring);
6138 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 return NULL;
6140
Thomas Wouters477c8d52006-05-27 19:21:47 +00006141 result = stringlib_find_slice(
6142 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6143 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6144 start, end
6145 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006148
6149 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150}
6151
6152static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
6155 if (index < 0 || index >= self->length) {
6156 PyErr_SetString(PyExc_IndexError, "string index out of range");
6157 return NULL;
6158 }
6159
6160 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6161}
6162
6163static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006164unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006166 /* Since Unicode objects compare equal to their UTF-8 string
6167 counterparts, we hash the UTF-8 string. */
6168 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6169 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170}
6171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006172PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173"S.index(sub [,start [,end]]) -> int\n\
6174\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006175Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176
6177static PyObject *
6178unicode_index(PyUnicodeObject *self, PyObject *args)
6179{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006180 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006181 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006183 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Guido van Rossumb8872e62000-05-09 14:14:27 +00006185 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6186 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006188 substring = PyUnicode_FromObject(substring);
6189 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 return NULL;
6191
Thomas Wouters477c8d52006-05-27 19:21:47 +00006192 result = stringlib_find_slice(
6193 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6194 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6195 start, end
6196 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
6198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 if (result < 0) {
6201 PyErr_SetString(PyExc_ValueError, "substring not found");
6202 return NULL;
6203 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006204
Martin v. Löwis18e16552006-02-15 17:27:45 +00006205 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006209"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006211Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
6214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006215unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
6217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6218 register const Py_UNICODE *e;
6219 int cased;
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 /* Shortcut for single character strings */
6222 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006223 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006225 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006226 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006228
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 e = p + PyUnicode_GET_SIZE(self);
6230 cased = 0;
6231 for (; p < e; p++) {
6232 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006233
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006235 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 else if (!cased && Py_UNICODE_ISLOWER(ch))
6237 cased = 1;
6238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006239 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240}
6241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006243"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006245Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006246at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
6248static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006249unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250{
6251 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6252 register const Py_UNICODE *e;
6253 int cased;
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 /* Shortcut for single character strings */
6256 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006257 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006259 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006260 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006261 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006262
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 e = p + PyUnicode_GET_SIZE(self);
6264 cased = 0;
6265 for (; p < e; p++) {
6266 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006267
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006269 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 else if (!cased && Py_UNICODE_ISUPPER(ch))
6271 cased = 1;
6272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006273 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274}
6275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006276PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006277"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006279Return True if S is a titlecased string and there is at least one\n\
6280character in S, i.e. upper- and titlecase characters may only\n\
6281follow uncased characters and lowercase characters only cased ones.\n\
6282Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
6284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006285unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286{
6287 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6288 register const Py_UNICODE *e;
6289 int cased, previous_is_cased;
6290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 /* Shortcut for single character strings */
6292 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006293 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6294 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006296 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006297 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006298 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 e = p + PyUnicode_GET_SIZE(self);
6301 cased = 0;
6302 previous_is_cased = 0;
6303 for (; p < e; p++) {
6304 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6307 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006308 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 previous_is_cased = 1;
6310 cased = 1;
6311 }
6312 else if (Py_UNICODE_ISLOWER(ch)) {
6313 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006314 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 previous_is_cased = 1;
6316 cased = 1;
6317 }
6318 else
6319 previous_is_cased = 0;
6320 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006321 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006324PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006325"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006327Return True if all characters in S are whitespace\n\
6328and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
6330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006331unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332{
6333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6334 register const Py_UNICODE *e;
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 /* Shortcut for single character strings */
6337 if (PyUnicode_GET_SIZE(self) == 1 &&
6338 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006339 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006341 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006342 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006344
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 e = p + PyUnicode_GET_SIZE(self);
6346 for (; p < e; p++) {
6347 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351}
6352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006354"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006356Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006357and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006358
6359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006360unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006361{
6362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6363 register const Py_UNICODE *e;
6364
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006365 /* Shortcut for single character strings */
6366 if (PyUnicode_GET_SIZE(self) == 1 &&
6367 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006369
6370 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006371 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006372 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006373
6374 e = p + PyUnicode_GET_SIZE(self);
6375 for (; p < e; p++) {
6376 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006377 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006378 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006379 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006380}
6381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006382PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006383"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006384\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006385Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006386and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006387
6388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006389unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006390{
6391 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6392 register const Py_UNICODE *e;
6393
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006394 /* Shortcut for single character strings */
6395 if (PyUnicode_GET_SIZE(self) == 1 &&
6396 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006397 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006398
6399 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006400 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006401 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006402
6403 e = p + PyUnicode_GET_SIZE(self);
6404 for (; p < e; p++) {
6405 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006406 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006407 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006409}
6410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006411PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006414Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
6417static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006418unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
6420 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6421 register const Py_UNICODE *e;
6422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 /* Shortcut for single character strings */
6424 if (PyUnicode_GET_SIZE(self) == 1 &&
6425 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006426 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006428 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006429 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006430 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006431
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 e = p + PyUnicode_GET_SIZE(self);
6433 for (; p < e; p++) {
6434 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006435 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006437 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438}
6439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006440PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006441"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006443Return True if all characters in S are digits\n\
6444and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445
6446static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006447unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448{
6449 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6450 register const Py_UNICODE *e;
6451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 /* Shortcut for single character strings */
6453 if (PyUnicode_GET_SIZE(self) == 1 &&
6454 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006455 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006457 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006458 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 e = p + PyUnicode_GET_SIZE(self);
6462 for (; p < e; p++) {
6463 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467}
6468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006470"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006472Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006473False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006476unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
6478 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6479 register const Py_UNICODE *e;
6480
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 /* Shortcut for single character strings */
6482 if (PyUnicode_GET_SIZE(self) == 1 &&
6483 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006486 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006487 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 e = p + PyUnicode_GET_SIZE(self);
6491 for (; p < e; p++) {
6492 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006493 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006495 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499"S.join(sequence) -> unicode\n\
6500\n\
6501Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006502sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
6504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006505unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006507 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508}
6509
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511unicode_length(PyUnicodeObject *self)
6512{
6513 return self->length;
6514}
6515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006517"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518\n\
6519Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006520done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522static PyObject *
6523unicode_ljust(PyUnicodeObject *self, PyObject *args)
6524{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006525 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006526 Py_UNICODE fillchar = ' ';
6527
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006528 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 return NULL;
6530
Tim Peters7a29bd52001-09-12 03:03:31 +00006531 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 Py_INCREF(self);
6533 return (PyObject*) self;
6534 }
6535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006536 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537}
6538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006539PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540"S.lower() -> unicode\n\
6541\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006542Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543
6544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006545unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return fixup(self, fixlower);
6548}
6549
/* Strip modes.  Each value is also the index of the matching entry in
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings for PyArg_ParseTuple(), indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
6558
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006559/* externally visible for str.strip(unicode) */
6560PyObject *
6561_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6562{
6563 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006565 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006566 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6567 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006568
Thomas Wouters477c8d52006-05-27 19:21:47 +00006569 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6570
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006571 i = 0;
6572 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006573 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6574 i++;
6575 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006576 }
6577
6578 j = len;
6579 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 do {
6581 j--;
6582 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6583 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006584 }
6585
6586 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006587 Py_INCREF(self);
6588 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006589 }
6590 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006592}
6593
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594
6595static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006596do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006598 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006599 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006600
6601 i = 0;
6602 if (striptype != RIGHTSTRIP) {
6603 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6604 i++;
6605 }
6606 }
6607
6608 j = len;
6609 if (striptype != LEFTSTRIP) {
6610 do {
6611 j--;
6612 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6613 j++;
6614 }
6615
6616 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6617 Py_INCREF(self);
6618 return (PyObject*)self;
6619 }
6620 else
6621 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622}
6623
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006624
6625static PyObject *
6626do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6627{
6628 PyObject *sep = NULL;
6629
6630 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6631 return NULL;
6632
6633 if (sep != NULL && sep != Py_None) {
6634 if (PyUnicode_Check(sep))
6635 return _PyUnicode_XStrip(self, striptype, sep);
6636 else if (PyString_Check(sep)) {
6637 PyObject *res;
6638 sep = PyUnicode_FromObject(sep);
6639 if (sep==NULL)
6640 return NULL;
6641 res = _PyUnicode_XStrip(self, striptype, sep);
6642 Py_DECREF(sep);
6643 return res;
6644 }
6645 else {
6646 PyErr_Format(PyExc_TypeError,
6647 "%s arg must be None, unicode or str",
6648 STRIPNAME(striptype));
6649 return NULL;
6650 }
6651 }
6652
6653 return do_strip(self, striptype);
6654}
6655
6656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006657PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006658"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006659\n\
6660Return a copy of the string S with leading and trailing\n\
6661whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006662If chars is given and not None, remove characters in chars instead.\n\
6663If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006664
6665static PyObject *
6666unicode_strip(PyUnicodeObject *self, PyObject *args)
6667{
6668 if (PyTuple_GET_SIZE(args) == 0)
6669 return do_strip(self, BOTHSTRIP); /* Common case */
6670 else
6671 return do_argstrip(self, BOTHSTRIP, args);
6672}
6673
6674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006675PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006676"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006677\n\
6678Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006679If chars is given and not None, remove characters in chars instead.\n\
6680If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006681
6682static PyObject *
6683unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6684{
6685 if (PyTuple_GET_SIZE(args) == 0)
6686 return do_strip(self, LEFTSTRIP); /* Common case */
6687 else
6688 return do_argstrip(self, LEFTSTRIP, args);
6689}
6690
6691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006692PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006693"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006694\n\
6695Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006696If chars is given and not None, remove characters in chars instead.\n\
6697If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006698
6699static PyObject *
6700unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6701{
6702 if (PyTuple_GET_SIZE(args) == 0)
6703 return do_strip(self, RIGHTSTRIP); /* Common case */
6704 else
6705 return do_argstrip(self, RIGHTSTRIP, args);
6706}
6707
6708
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006710unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
6712 PyUnicodeObject *u;
6713 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006715 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
6717 if (len < 0)
6718 len = 0;
6719
Tim Peters7a29bd52001-09-12 03:03:31 +00006720 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 /* no repeat, return original string */
6722 Py_INCREF(str);
6723 return (PyObject*) str;
6724 }
Tim Peters8f422462000-09-09 06:13:41 +00006725
6726 /* ensure # of chars needed doesn't overflow int and # of bytes
6727 * needed doesn't overflow size_t
6728 */
6729 nchars = len * str->length;
6730 if (len && nchars / len != str->length) {
6731 PyErr_SetString(PyExc_OverflowError,
6732 "repeated string is too long");
6733 return NULL;
6734 }
6735 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6736 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6737 PyErr_SetString(PyExc_OverflowError,
6738 "repeated string is too long");
6739 return NULL;
6740 }
6741 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (!u)
6743 return NULL;
6744
6745 p = u->str;
6746
Thomas Wouters477c8d52006-05-27 19:21:47 +00006747 if (str->length == 1 && len > 0) {
6748 Py_UNICODE_FILL(p, str->str[0], len);
6749 } else {
6750 Py_ssize_t done = 0; /* number of characters copied this far */
6751 if (done < nchars) {
6752 Py_UNICODE_COPY(p, str->str, str->length);
6753 done = str->length;
6754 }
6755 while (done < nchars) {
6756 int n = (done <= nchars-done) ? done : nchars-done;
6757 Py_UNICODE_COPY(p+done, p, n);
6758 done += n;
6759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 }
6761
6762 return (PyObject*) u;
6763}
6764
6765PyObject *PyUnicode_Replace(PyObject *obj,
6766 PyObject *subobj,
6767 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
6770 PyObject *self;
6771 PyObject *str1;
6772 PyObject *str2;
6773 PyObject *result;
6774
6775 self = PyUnicode_FromObject(obj);
6776 if (self == NULL)
6777 return NULL;
6778 str1 = PyUnicode_FromObject(subobj);
6779 if (str1 == NULL) {
6780 Py_DECREF(self);
6781 return NULL;
6782 }
6783 str2 = PyUnicode_FromObject(replobj);
6784 if (str2 == NULL) {
6785 Py_DECREF(self);
6786 Py_DECREF(str1);
6787 return NULL;
6788 }
Tim Petersced69f82003-09-16 20:30:58 +00006789 result = replace((PyUnicodeObject *)self,
6790 (PyUnicodeObject *)str1,
6791 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 maxcount);
6793 Py_DECREF(self);
6794 Py_DECREF(str1);
6795 Py_DECREF(str2);
6796 return result;
6797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800"S.replace (old, new[, maxsplit]) -> unicode\n\
6801\n\
6802Return a copy of S with all occurrences of substring\n\
6803old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806static PyObject*
6807unicode_replace(PyUnicodeObject *self, PyObject *args)
6808{
6809 PyUnicodeObject *str1;
6810 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006811 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 PyObject *result;
6813
Martin v. Löwis18e16552006-02-15 17:27:45 +00006814 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 return NULL;
6816 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6817 if (str1 == NULL)
6818 return NULL;
6819 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006820 if (str2 == NULL) {
6821 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
6825 result = replace(self, str1, str2, maxcount);
6826
6827 Py_DECREF(str1);
6828 Py_DECREF(str2);
6829 return result;
6830}
6831
6832static
6833PyObject *unicode_repr(PyObject *unicode)
6834{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006835 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006836 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006837 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6838 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6839
6840 /* XXX(nnorwitz): rather than over-allocating, it would be
6841 better to choose a different scheme. Perhaps scan the
6842 first N-chars of the string and allocate based on that size.
6843 */
6844 /* Initial allocation is based on the longest-possible unichr
6845 escape.
6846
6847 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6848 unichr, so in this case it's the longest unichr escape. In
6849 narrow (UTF-16) builds this is five chars per source unichr
6850 since there are two unichrs in the surrogate pair, so in narrow
6851 (UTF-16) builds it's not the longest unichr escape.
6852
6853 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6854 so in the narrow (UTF-16) build case it's the longest unichr
6855 escape.
6856 */
6857
Walter Dörwald1ab83302007-05-18 17:15:44 +00006858 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006859 2 /* quotes */
6860#ifdef Py_UNICODE_WIDE
6861 + 10*size
6862#else
6863 + 6*size
6864#endif
6865 + 1);
6866 if (repr == NULL)
6867 return NULL;
6868
Walter Dörwald1ab83302007-05-18 17:15:44 +00006869 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006870
6871 /* Add quote */
6872 *p++ = (findchar(s, size, '\'') &&
6873 !findchar(s, size, '"')) ? '"' : '\'';
6874 while (size-- > 0) {
6875 Py_UNICODE ch = *s++;
6876
6877 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006878 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006879 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006880 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006881 continue;
6882 }
6883
6884#ifdef Py_UNICODE_WIDE
6885 /* Map 21-bit characters to '\U00xxxxxx' */
6886 else if (ch >= 0x10000) {
6887 *p++ = '\\';
6888 *p++ = 'U';
6889 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6890 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6891 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6892 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6893 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6894 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6895 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6896 *p++ = hexdigits[ch & 0x0000000F];
6897 continue;
6898 }
6899#else
6900 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6901 else if (ch >= 0xD800 && ch < 0xDC00) {
6902 Py_UNICODE ch2;
6903 Py_UCS4 ucs;
6904
6905 ch2 = *s++;
6906 size--;
6907 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6908 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6909 *p++ = '\\';
6910 *p++ = 'U';
6911 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6912 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6913 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6914 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6915 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6916 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6917 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6918 *p++ = hexdigits[ucs & 0x0000000F];
6919 continue;
6920 }
6921 /* Fall through: isolated surrogates are copied as-is */
6922 s--;
6923 size++;
6924 }
6925#endif
6926
6927 /* Map 16-bit characters to '\uxxxx' */
6928 if (ch >= 256) {
6929 *p++ = '\\';
6930 *p++ = 'u';
6931 *p++ = hexdigits[(ch >> 12) & 0x000F];
6932 *p++ = hexdigits[(ch >> 8) & 0x000F];
6933 *p++ = hexdigits[(ch >> 4) & 0x000F];
6934 *p++ = hexdigits[ch & 0x000F];
6935 }
6936
6937 /* Map special whitespace to '\t', \n', '\r' */
6938 else if (ch == '\t') {
6939 *p++ = '\\';
6940 *p++ = 't';
6941 }
6942 else if (ch == '\n') {
6943 *p++ = '\\';
6944 *p++ = 'n';
6945 }
6946 else if (ch == '\r') {
6947 *p++ = '\\';
6948 *p++ = 'r';
6949 }
6950
6951 /* Map non-printable US ASCII to '\xhh' */
6952 else if (ch < ' ' || ch >= 0x7F) {
6953 *p++ = '\\';
6954 *p++ = 'x';
6955 *p++ = hexdigits[(ch >> 4) & 0x000F];
6956 *p++ = hexdigits[ch & 0x000F];
6957 }
6958
6959 /* Copy everything else as-is */
6960 else
6961 *p++ = (char) ch;
6962 }
6963 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006964 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00006965
6966 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006967 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00006968 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969}
6970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972"S.rfind(sub [,start [,end]]) -> int\n\
6973\n\
6974Return the highest index in S where substring sub is found,\n\
6975such that sub is contained within s[start,end]. Optional\n\
6976arguments start and end are interpreted as in slice notation.\n\
6977\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006978Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
6980static PyObject *
6981unicode_rfind(PyUnicodeObject *self, PyObject *args)
6982{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006984 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006985 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006986 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
Guido van Rossumb8872e62000-05-09 14:14:27 +00006988 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6989 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006991 substring = PyUnicode_FromObject(substring);
6992 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 return NULL;
6994
Thomas Wouters477c8d52006-05-27 19:21:47 +00006995 result = stringlib_rfind_slice(
6996 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6997 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6998 start, end
6999 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000
7001 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002
7003 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004}
7005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007"S.rindex(sub [,start [,end]]) -> int\n\
7008\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010
7011static PyObject *
7012unicode_rindex(PyUnicodeObject *self, PyObject *args)
7013{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007015 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007016 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
Guido van Rossumb8872e62000-05-09 14:14:27 +00007019 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7020 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022 substring = PyUnicode_FromObject(substring);
7023 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 return NULL;
7025
Thomas Wouters477c8d52006-05-27 19:21:47 +00007026 result = stringlib_rfind_slice(
7027 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7028 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7029 start, end
7030 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007033
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 if (result < 0) {
7035 PyErr_SetString(PyExc_ValueError, "substring not found");
7036 return NULL;
7037 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007038 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007041PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007042"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043\n\
7044Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007045done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047static PyObject *
7048unicode_rjust(PyUnicodeObject *self, PyObject *args)
7049{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007050 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007051 Py_UNICODE fillchar = ' ';
7052
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007053 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 return NULL;
7055
Tim Peters7a29bd52001-09-12 03:03:31 +00007056 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 Py_INCREF(self);
7058 return (PyObject*) self;
7059 }
7060
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007061 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007065unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066{
7067 /* standard clamping */
7068 if (start < 0)
7069 start = 0;
7070 if (end < 0)
7071 end = 0;
7072 if (end > self->length)
7073 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007074 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 /* full slice, return original string */
7076 Py_INCREF(self);
7077 return (PyObject*) self;
7078 }
7079 if (start > end)
7080 start = end;
7081 /* copy slice */
7082 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7083 end - start);
7084}
7085
7086PyObject *PyUnicode_Split(PyObject *s,
7087 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089{
7090 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007091
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 s = PyUnicode_FromObject(s);
7093 if (s == NULL)
7094 return NULL;
7095 if (sep != NULL) {
7096 sep = PyUnicode_FromObject(sep);
7097 if (sep == NULL) {
7098 Py_DECREF(s);
7099 return NULL;
7100 }
7101 }
7102
7103 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7104
7105 Py_DECREF(s);
7106 Py_XDECREF(sep);
7107 return result;
7108}
7109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007110PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111"S.split([sep [,maxsplit]]) -> list of strings\n\
7112\n\
7113Return a list of the words in S, using sep as the\n\
7114delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007115splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007116any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject*
7119unicode_split(PyUnicodeObject *self, PyObject *args)
7120{
7121 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007122 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123
Martin v. Löwis18e16552006-02-15 17:27:45 +00007124 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 return NULL;
7126
7127 if (substring == Py_None)
7128 return split(self, NULL, maxcount);
7129 else if (PyUnicode_Check(substring))
7130 return split(self, (PyUnicodeObject *)substring, maxcount);
7131 else
7132 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7133}
7134
Thomas Wouters477c8d52006-05-27 19:21:47 +00007135PyObject *
7136PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7137{
7138 PyObject* str_obj;
7139 PyObject* sep_obj;
7140 PyObject* out;
7141
7142 str_obj = PyUnicode_FromObject(str_in);
7143 if (!str_obj)
7144 return NULL;
7145 sep_obj = PyUnicode_FromObject(sep_in);
7146 if (!sep_obj) {
7147 Py_DECREF(str_obj);
7148 return NULL;
7149 }
7150
7151 out = stringlib_partition(
7152 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7153 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7154 );
7155
7156 Py_DECREF(sep_obj);
7157 Py_DECREF(str_obj);
7158
7159 return out;
7160}
7161
7162
7163PyObject *
7164PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7165{
7166 PyObject* str_obj;
7167 PyObject* sep_obj;
7168 PyObject* out;
7169
7170 str_obj = PyUnicode_FromObject(str_in);
7171 if (!str_obj)
7172 return NULL;
7173 sep_obj = PyUnicode_FromObject(sep_in);
7174 if (!sep_obj) {
7175 Py_DECREF(str_obj);
7176 return NULL;
7177 }
7178
7179 out = stringlib_rpartition(
7180 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7181 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7182 );
7183
7184 Py_DECREF(sep_obj);
7185 Py_DECREF(str_obj);
7186
7187 return out;
7188}
7189
7190PyDoc_STRVAR(partition__doc__,
7191"S.partition(sep) -> (head, sep, tail)\n\
7192\n\
7193Searches for the separator sep in S, and returns the part before it,\n\
7194the separator itself, and the part after it. If the separator is not\n\
7195found, returns S and two empty strings.");
7196
7197static PyObject*
7198unicode_partition(PyUnicodeObject *self, PyObject *separator)
7199{
7200 return PyUnicode_Partition((PyObject *)self, separator);
7201}
7202
7203PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007204"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007205\n\
7206Searches for the separator sep in S, starting at the end of S, and returns\n\
7207the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007208separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007209
7210static PyObject*
7211unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7212{
7213 return PyUnicode_RPartition((PyObject *)self, separator);
7214}
7215
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007216PyObject *PyUnicode_RSplit(PyObject *s,
7217 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007218 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007219{
7220 PyObject *result;
7221
7222 s = PyUnicode_FromObject(s);
7223 if (s == NULL)
7224 return NULL;
7225 if (sep != NULL) {
7226 sep = PyUnicode_FromObject(sep);
7227 if (sep == NULL) {
7228 Py_DECREF(s);
7229 return NULL;
7230 }
7231 }
7232
7233 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7234
7235 Py_DECREF(s);
7236 Py_XDECREF(sep);
7237 return result;
7238}
7239
7240PyDoc_STRVAR(rsplit__doc__,
7241"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7242\n\
7243Return a list of the words in S, using sep as the\n\
7244delimiter string, starting at the end of the string and\n\
7245working to the front. If maxsplit is given, at most maxsplit\n\
7246splits are done. If sep is not specified, any whitespace string\n\
7247is a separator.");
7248
7249static PyObject*
7250unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7251{
7252 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007254
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007256 return NULL;
7257
7258 if (substring == Py_None)
7259 return rsplit(self, NULL, maxcount);
7260 else if (PyUnicode_Check(substring))
7261 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7262 else
7263 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7264}
7265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007266PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007267"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268\n\
7269Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007270Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007271is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273static PyObject*
7274unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7275{
Guido van Rossum86662912000-04-11 15:38:46 +00007276 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
Guido van Rossum86662912000-04-11 15:38:46 +00007278 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 return NULL;
7280
Guido van Rossum86662912000-04-11 15:38:46 +00007281 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282}
7283
7284static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007285PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007287 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7288 Py_XINCREF(res);
7289 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293"S.swapcase() -> unicode\n\
7294\n\
7295Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007299unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 return fixup(self, fixswapcase);
7302}
7303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007304PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305"S.translate(table) -> unicode\n\
7306\n\
7307Return a copy of the string S, where all characters have been mapped\n\
7308through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007309Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7310Unmapped characters are left untouched. Characters mapped to None\n\
7311are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007314unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
Tim Petersced69f82003-09-16 20:30:58 +00007316 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007318 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 "ignore");
7320}
7321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323"S.upper() -> unicode\n\
7324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
7327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007328unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 return fixup(self, fixupper);
7331}
7332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007333PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334"S.zfill(width) -> unicode\n\
7335\n\
7336Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
7339static PyObject *
7340unicode_zfill(PyUnicodeObject *self, PyObject *args)
7341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007342 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 PyUnicodeObject *u;
7344
Martin v. Löwis18e16552006-02-15 17:27:45 +00007345 Py_ssize_t width;
7346 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 return NULL;
7348
7349 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007350 if (PyUnicode_CheckExact(self)) {
7351 Py_INCREF(self);
7352 return (PyObject*) self;
7353 }
7354 else
7355 return PyUnicode_FromUnicode(
7356 PyUnicode_AS_UNICODE(self),
7357 PyUnicode_GET_SIZE(self)
7358 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 }
7360
7361 fill = width - self->length;
7362
7363 u = pad(self, fill, 0, '0');
7364
Walter Dörwald068325e2002-04-15 13:36:47 +00007365 if (u == NULL)
7366 return NULL;
7367
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 if (u->str[fill] == '+' || u->str[fill] == '-') {
7369 /* move sign to beginning of string */
7370 u->str[0] = u->str[fill];
7371 u->str[fill] = '0';
7372 }
7373
7374 return (PyObject*) u;
7375}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
#if 0
/* Compiled out.  Returns unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007386"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007388Return True if S starts with the specified prefix, False otherwise.\n\
7389With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390With optional end, stop comparing S at that position.\n\
7391prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
7393static PyObject *
7394unicode_startswith(PyUnicodeObject *self,
7395 PyObject *args)
7396{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007400 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007403 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007404 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406 if (PyTuple_Check(subobj)) {
7407 Py_ssize_t i;
7408 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7409 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7410 PyTuple_GET_ITEM(subobj, i));
7411 if (substring == NULL)
7412 return NULL;
7413 result = tailmatch(self, substring, start, end, -1);
7414 Py_DECREF(substring);
7415 if (result) {
7416 Py_RETURN_TRUE;
7417 }
7418 }
7419 /* nothing matched */
7420 Py_RETURN_FALSE;
7421 }
7422 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 return NULL;
7425 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
7429
7430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007432"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007434Return True if S ends with the specified suffix, False otherwise.\n\
7435With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436With optional end, stop comparing S at that position.\n\
7437suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
7439static PyObject *
7440unicode_endswith(PyUnicodeObject *self,
7441 PyObject *args)
7442{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007446 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452 if (PyTuple_Check(subobj)) {
7453 Py_ssize_t i;
7454 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7455 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7456 PyTuple_GET_ITEM(subobj, i));
7457 if (substring == NULL)
7458 return NULL;
7459 result = tailmatch(self, substring, start, end, +1);
7460 Py_DECREF(substring);
7461 if (result) {
7462 Py_RETURN_TRUE;
7463 }
7464 }
7465 Py_RETURN_FALSE;
7466 }
7467 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474}
7475
7476
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007477
7478static PyObject *
7479unicode_getnewargs(PyUnicodeObject *v)
7480{
7481 return Py_BuildValue("(u#)", v->str, v->length);
7482}
7483
7484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485static PyMethodDef unicode_methods[] = {
7486
7487 /* Order is according to common usage: often used methods should
7488 appear first, since lookup is done sequentially. */
7489
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007490 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7491 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7492 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007493 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007494 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7495 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7496 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7497 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7498 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7499 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7500 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007502 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7503 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7504 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007505 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007506 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007507/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7508 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7509 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7510 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007511 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007512 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007513 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007514 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7516 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7517 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7518 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7519 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7520 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7521 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7522 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7523 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7524 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7525 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7526 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7527 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7528 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007529 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007530#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007531 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532#endif
7533
7534#if 0
7535 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007536 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537#endif
7538
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007539 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 {NULL, NULL}
7541};
7542
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007543static PyObject *
7544unicode_mod(PyObject *v, PyObject *w)
7545{
7546 if (!PyUnicode_Check(v)) {
7547 Py_INCREF(Py_NotImplemented);
7548 return Py_NotImplemented;
7549 }
7550 return PyUnicode_Format(v, w);
7551}
7552
7553static PyNumberMethods unicode_as_number = {
7554 0, /*nb_add*/
7555 0, /*nb_subtract*/
7556 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007557 unicode_mod, /*nb_remainder*/
7558};
7559
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007561 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007562 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7564 (ssizeargfunc) unicode_getitem, /* sq_item */
7565 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 0, /* sq_ass_item */
7567 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007568 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569};
7570
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007571static PyObject*
7572unicode_subscript(PyUnicodeObject* self, PyObject* item)
7573{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007574 if (PyIndex_Check(item)) {
7575 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007576 if (i == -1 && PyErr_Occurred())
7577 return NULL;
7578 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007579 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007580 return unicode_getitem(self, i);
7581 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007583 Py_UNICODE* source_buf;
7584 Py_UNICODE* result_buf;
7585 PyObject* result;
7586
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007587 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007588 &start, &stop, &step, &slicelength) < 0) {
7589 return NULL;
7590 }
7591
7592 if (slicelength <= 0) {
7593 return PyUnicode_FromUnicode(NULL, 0);
7594 } else {
7595 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007596 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7597 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007598
7599 if (result_buf == NULL)
7600 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007601
7602 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7603 result_buf[i] = source_buf[cur];
7604 }
Tim Petersced69f82003-09-16 20:30:58 +00007605
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007606 result = PyUnicode_FromUnicode(result_buf, slicelength);
7607 PyMem_FREE(result_buf);
7608 return result;
7609 }
7610 } else {
7611 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7612 return NULL;
7613 }
7614}
7615
7616static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007617 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007618 (binaryfunc)unicode_subscript, /* mp_subscript */
7619 (objobjargproc)0, /* mp_ass_subscript */
7620};
7621
Martin v. Löwis18e16552006-02-15 17:27:45 +00007622static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007624 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 const void **ptr)
7626{
7627 if (index != 0) {
7628 PyErr_SetString(PyExc_SystemError,
7629 "accessing non-existent unicode segment");
7630 return -1;
7631 }
7632 *ptr = (void *) self->str;
7633 return PyUnicode_GET_DATA_SIZE(self);
7634}
7635
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636static Py_ssize_t
7637unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 const void **ptr)
7639{
7640 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007641 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 return -1;
7643}
7644
7645static int
7646unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648{
7649 if (lenp)
7650 *lenp = PyUnicode_GET_DATA_SIZE(self);
7651 return 1;
7652}
7653
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007654static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007656 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 const void **ptr)
7658{
7659 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007660
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661 if (index != 0) {
7662 PyErr_SetString(PyExc_SystemError,
7663 "accessing non-existent unicode segment");
7664 return -1;
7665 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007666 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 if (str == NULL)
7668 return -1;
7669 *ptr = (void *) PyString_AS_STRING(str);
7670 return PyString_GET_SIZE(str);
7671}
7672
7673/* Helpers for PyUnicode_Format() */
7674
7675static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007676getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007678 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 if (argidx < arglen) {
7680 (*p_argidx)++;
7681 if (arglen < 0)
7682 return args;
7683 else
7684 return PyTuple_GetItem(args, argidx);
7685 }
7686 PyErr_SetString(PyExc_TypeError,
7687 "not enough arguments for format string");
7688 return NULL;
7689}
7690
/* Single-bit flag masks, one distinct bit each so they can be OR-ed
   together.  NOTE(review): presumably these populate the `flags`
   argument of the formatting helpers -- confirm at the use sites. */
#define F_LJUST     (1<<0)
#define F_SIGN      (1<<1)
#define F_BLANK     (1<<2)
#define F_ALT       (1<<3)
#define F_ZERO      (1<<4)
7696
Martin v. Löwis18e16552006-02-15 17:27:45 +00007697static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007698strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007700 register Py_ssize_t i;
7701 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 for (i = len - 1; i >= 0; i--)
7703 buffer[i] = (Py_UNICODE) charbuffer[i];
7704
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 return len;
7706}
7707
Neal Norwitzfc76d632006-01-10 06:03:13 +00007708static int
7709doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7710{
Tim Peters15231542006-02-16 01:08:01 +00007711 Py_ssize_t result;
7712
Neal Norwitzfc76d632006-01-10 06:03:13 +00007713 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007714 result = strtounicode(buffer, (char *)buffer);
7715 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007716}
7717
7718static int
7719longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7720{
Tim Peters15231542006-02-16 01:08:01 +00007721 Py_ssize_t result;
7722
Neal Norwitzfc76d632006-01-10 06:03:13 +00007723 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007724 result = strtounicode(buffer, (char *)buffer);
7725 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007726}
7727
Guido van Rossum078151d2002-08-11 04:24:12 +00007728/* XXX To save some code duplication, formatfloat/long/int could have been
7729 shared with stringobject.c, converting from 8-bit to Unicode after the
7730 formatting is done. */
7731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732static int
7733formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007734 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 int flags,
7736 int prec,
7737 int type,
7738 PyObject *v)
7739{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007740 /* fmt = '%#.' + `prec` + `type`
7741 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 char fmt[20];
7743 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 x = PyFloat_AsDouble(v);
7746 if (x == -1.0 && PyErr_Occurred())
7747 return -1;
7748 if (prec < 0)
7749 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7751 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007752 /* Worst case length calc to ensure no buffer overrun:
7753
7754 'g' formats:
7755 fmt = %#.<prec>g
7756 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7757 for any double rep.)
7758 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7759
7760 'f' formats:
7761 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7762 len = 1 + 50 + 1 + prec = 52 + prec
7763
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007764 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007765 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007766
7767 */
7768 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7769 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007770 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007771 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007772 return -1;
7773 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007774 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7775 (flags&F_ALT) ? "#" : "",
7776 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007777 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Tim Peters38fd5b62000-09-21 05:43:11 +00007780static PyObject*
7781formatlong(PyObject *val, int flags, int prec, int type)
7782{
7783 char *buf;
7784 int i, len;
7785 PyObject *str; /* temporary string object. */
7786 PyUnicodeObject *result;
7787
7788 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7789 if (!str)
7790 return NULL;
7791 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007792 if (!result) {
7793 Py_DECREF(str);
7794 return NULL;
7795 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007796 for (i = 0; i < len; i++)
7797 result->str[i] = buf[i];
7798 result->str[len] = 0;
7799 Py_DECREF(str);
7800 return (PyObject*)result;
7801}
7802
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803static int
7804formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007805 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 int flags,
7807 int prec,
7808 int type,
7809 PyObject *v)
7810{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007811 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007812 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7813 * + 1 + 1
7814 * = 24
7815 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007816 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007817 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 long x;
7819
7820 x = PyInt_AsLong(v);
7821 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007822 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007823 if (x < 0 && type == 'u') {
7824 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007825 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007826 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7827 sign = "-";
7828 else
7829 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007831 prec = 1;
7832
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007833 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7834 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007835 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007836 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007837 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007838 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007839 return -1;
7840 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007841
7842 if ((flags & F_ALT) &&
7843 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007844 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007845 * of issues that cause pain:
7846 * - when 0 is being converted, the C standard leaves off
7847 * the '0x' or '0X', which is inconsistent with other
7848 * %#x/%#X conversions and inconsistent with Python's
7849 * hex() function
7850 * - there are platforms that violate the standard and
7851 * convert 0 with the '0x' or '0X'
7852 * (Metrowerks, Compaq Tru64)
7853 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007854 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007855 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007856 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007857 * We can achieve the desired consistency by inserting our
7858 * own '0x' or '0X' prefix, and substituting %x/%X in place
7859 * of %#x/%#X.
7860 *
7861 * Note that this is the same approach as used in
7862 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007863 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007864 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7865 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007866 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007867 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007868 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7869 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007870 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007871 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007872 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007873 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007874 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007875 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876}
7877
7878static int
7879formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007880 size_t buflen,
7881 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007883 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007884 if (PyUnicode_Check(v)) {
7885 if (PyUnicode_GET_SIZE(v) != 1)
7886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007890 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007891 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007892 goto onError;
7893 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895
7896 else {
7897 /* Integer input truncated to a character */
7898 long x;
7899 x = PyInt_AsLong(v);
7900 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007901 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007902#ifdef Py_UNICODE_WIDE
7903 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007904 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007905 "%c arg not in range(0x110000) "
7906 "(wide Python build)");
7907 return -1;
7908 }
7909#else
7910 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007911 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007912 "%c arg not in range(0x10000) "
7913 "(narrow Python build)");
7914 return -1;
7915 }
7916#endif
7917 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 }
7919 buf[1] = '\0';
7920 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007921
7922 onError:
7923 PyErr_SetString(PyExc_TypeError,
7924 "%c requires int or char");
7925 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926}
7927
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007928/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7929
7930 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7931 chars are formatted. XXX This is a magic number. Each formatting
7932 routine does bounds checking to ensure no overflow, but a better
7933 solution may be to malloc a buffer of appropriate size for each
7934 format. For now, the current solution is sufficient.
7935*/
7936#define FORMATBUFLEN (size_t)120
7937
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938PyObject *PyUnicode_Format(PyObject *format,
7939 PyObject *args)
7940{
7941 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007942 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 int args_owned = 0;
7944 PyUnicodeObject *result = NULL;
7945 PyObject *dict = NULL;
7946 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007947
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 if (format == NULL || args == NULL) {
7949 PyErr_BadInternalCall();
7950 return NULL;
7951 }
7952 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007953 if (uformat == NULL)
7954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 fmt = PyUnicode_AS_UNICODE(uformat);
7956 fmtcnt = PyUnicode_GET_SIZE(uformat);
7957
7958 reslen = rescnt = fmtcnt + 100;
7959 result = _PyUnicode_New(reslen);
7960 if (result == NULL)
7961 goto onError;
7962 res = PyUnicode_AS_UNICODE(result);
7963
7964 if (PyTuple_Check(args)) {
7965 arglen = PyTuple_Size(args);
7966 argidx = 0;
7967 }
7968 else {
7969 arglen = -1;
7970 argidx = -2;
7971 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007972 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7973 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 dict = args;
7975
7976 while (--fmtcnt >= 0) {
7977 if (*fmt != '%') {
7978 if (--rescnt < 0) {
7979 rescnt = fmtcnt + 100;
7980 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007981 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7984 --rescnt;
7985 }
7986 *res++ = *fmt++;
7987 }
7988 else {
7989 /* Got a format specifier */
7990 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 Py_UNICODE c = '\0';
7994 Py_UNICODE fill;
7995 PyObject *v = NULL;
7996 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007997 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007999 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008000 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001
8002 fmt++;
8003 if (*fmt == '(') {
8004 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008005 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 PyObject *key;
8007 int pcount = 1;
8008
8009 if (dict == NULL) {
8010 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008011 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 goto onError;
8013 }
8014 ++fmt;
8015 --fmtcnt;
8016 keystart = fmt;
8017 /* Skip over balanced parentheses */
8018 while (pcount > 0 && --fmtcnt >= 0) {
8019 if (*fmt == ')')
8020 --pcount;
8021 else if (*fmt == '(')
8022 ++pcount;
8023 fmt++;
8024 }
8025 keylen = fmt - keystart - 1;
8026 if (fmtcnt < 0 || pcount > 0) {
8027 PyErr_SetString(PyExc_ValueError,
8028 "incomplete format key");
8029 goto onError;
8030 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008031#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008032 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 then looked up since Python uses strings to hold
8034 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008035 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 key = PyUnicode_EncodeUTF8(keystart,
8037 keylen,
8038 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008039#else
8040 key = PyUnicode_FromUnicode(keystart, keylen);
8041#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 if (key == NULL)
8043 goto onError;
8044 if (args_owned) {
8045 Py_DECREF(args);
8046 args_owned = 0;
8047 }
8048 args = PyObject_GetItem(dict, key);
8049 Py_DECREF(key);
8050 if (args == NULL) {
8051 goto onError;
8052 }
8053 args_owned = 1;
8054 arglen = -1;
8055 argidx = -2;
8056 }
8057 while (--fmtcnt >= 0) {
8058 switch (c = *fmt++) {
8059 case '-': flags |= F_LJUST; continue;
8060 case '+': flags |= F_SIGN; continue;
8061 case ' ': flags |= F_BLANK; continue;
8062 case '#': flags |= F_ALT; continue;
8063 case '0': flags |= F_ZERO; continue;
8064 }
8065 break;
8066 }
8067 if (c == '*') {
8068 v = getnextarg(args, arglen, &argidx);
8069 if (v == NULL)
8070 goto onError;
8071 if (!PyInt_Check(v)) {
8072 PyErr_SetString(PyExc_TypeError,
8073 "* wants int");
8074 goto onError;
8075 }
8076 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008077 if (width == -1 && PyErr_Occurred())
8078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 if (width < 0) {
8080 flags |= F_LJUST;
8081 width = -width;
8082 }
8083 if (--fmtcnt >= 0)
8084 c = *fmt++;
8085 }
8086 else if (c >= '0' && c <= '9') {
8087 width = c - '0';
8088 while (--fmtcnt >= 0) {
8089 c = *fmt++;
8090 if (c < '0' || c > '9')
8091 break;
8092 if ((width*10) / 10 != width) {
8093 PyErr_SetString(PyExc_ValueError,
8094 "width too big");
8095 goto onError;
8096 }
8097 width = width*10 + (c - '0');
8098 }
8099 }
8100 if (c == '.') {
8101 prec = 0;
8102 if (--fmtcnt >= 0)
8103 c = *fmt++;
8104 if (c == '*') {
8105 v = getnextarg(args, arglen, &argidx);
8106 if (v == NULL)
8107 goto onError;
8108 if (!PyInt_Check(v)) {
8109 PyErr_SetString(PyExc_TypeError,
8110 "* wants int");
8111 goto onError;
8112 }
8113 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008114 if (prec == -1 && PyErr_Occurred())
8115 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 if (prec < 0)
8117 prec = 0;
8118 if (--fmtcnt >= 0)
8119 c = *fmt++;
8120 }
8121 else if (c >= '0' && c <= '9') {
8122 prec = c - '0';
8123 while (--fmtcnt >= 0) {
8124 c = Py_CHARMASK(*fmt++);
8125 if (c < '0' || c > '9')
8126 break;
8127 if ((prec*10) / 10 != prec) {
8128 PyErr_SetString(PyExc_ValueError,
8129 "prec too big");
8130 goto onError;
8131 }
8132 prec = prec*10 + (c - '0');
8133 }
8134 }
8135 } /* prec */
8136 if (fmtcnt >= 0) {
8137 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 if (--fmtcnt >= 0)
8139 c = *fmt++;
8140 }
8141 }
8142 if (fmtcnt < 0) {
8143 PyErr_SetString(PyExc_ValueError,
8144 "incomplete format");
8145 goto onError;
8146 }
8147 if (c != '%') {
8148 v = getnextarg(args, arglen, &argidx);
8149 if (v == NULL)
8150 goto onError;
8151 }
8152 sign = 0;
8153 fill = ' ';
8154 switch (c) {
8155
8156 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008157 pbuf = formatbuf;
8158 /* presume that buffer length is at least 1 */
8159 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 len = 1;
8161 break;
8162
8163 case 's':
8164 case 'r':
8165 if (PyUnicode_Check(v) && c == 's') {
8166 temp = v;
8167 Py_INCREF(temp);
8168 }
8169 else {
8170 PyObject *unicode;
8171 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008172 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173 else
8174 temp = PyObject_Repr(v);
8175 if (temp == NULL)
8176 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008177 if (PyUnicode_Check(temp))
8178 /* nothing to do */;
8179 else if (PyString_Check(temp)) {
8180 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008181 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008183 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008185 Py_DECREF(temp);
8186 temp = unicode;
8187 if (temp == NULL)
8188 goto onError;
8189 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008190 else {
8191 Py_DECREF(temp);
8192 PyErr_SetString(PyExc_TypeError,
8193 "%s argument has non-string str()");
8194 goto onError;
8195 }
8196 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 len = PyUnicode_GET_SIZE(temp);
8199 if (prec >= 0 && len > prec)
8200 len = prec;
8201 break;
8202
8203 case 'i':
8204 case 'd':
8205 case 'u':
8206 case 'o':
8207 case 'x':
8208 case 'X':
8209 if (c == 'i')
8210 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008211 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008212 temp = formatlong(v, flags, prec, c);
8213 if (!temp)
8214 goto onError;
8215 pbuf = PyUnicode_AS_UNICODE(temp);
8216 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008217 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008219 else {
8220 pbuf = formatbuf;
8221 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8222 flags, prec, c, v);
8223 if (len < 0)
8224 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008225 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008226 }
8227 if (flags & F_ZERO)
8228 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 break;
8230
8231 case 'e':
8232 case 'E':
8233 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008234 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 case 'g':
8236 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008237 if (c == 'F')
8238 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008239 pbuf = formatbuf;
8240 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8241 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 if (len < 0)
8243 goto onError;
8244 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008245 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 fill = '0';
8247 break;
8248
8249 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 pbuf = formatbuf;
8251 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 if (len < 0)
8253 goto onError;
8254 break;
8255
8256 default:
8257 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008258 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008259 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008260 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008261 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008262 (Py_ssize_t)(fmt - 1 -
8263 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 goto onError;
8265 }
8266 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008267 if (*pbuf == '-' || *pbuf == '+') {
8268 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 len--;
8270 }
8271 else if (flags & F_SIGN)
8272 sign = '+';
8273 else if (flags & F_BLANK)
8274 sign = ' ';
8275 else
8276 sign = 0;
8277 }
8278 if (width < len)
8279 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008280 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 reslen -= rescnt;
8282 rescnt = width + fmtcnt + 100;
8283 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008284 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008285 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008286 PyErr_NoMemory();
8287 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008288 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008289 if (_PyUnicode_Resize(&result, reslen) < 0) {
8290 Py_XDECREF(temp);
8291 goto onError;
8292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 res = PyUnicode_AS_UNICODE(result)
8294 + reslen - rescnt;
8295 }
8296 if (sign) {
8297 if (fill != ' ')
8298 *res++ = sign;
8299 rescnt--;
8300 if (width > len)
8301 width--;
8302 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008303 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8304 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008305 assert(pbuf[1] == c);
8306 if (fill != ' ') {
8307 *res++ = *pbuf++;
8308 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008309 }
Tim Petersfff53252001-04-12 18:38:48 +00008310 rescnt -= 2;
8311 width -= 2;
8312 if (width < 0)
8313 width = 0;
8314 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 if (width > len && !(flags & F_LJUST)) {
8317 do {
8318 --rescnt;
8319 *res++ = fill;
8320 } while (--width > len);
8321 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008322 if (fill == ' ') {
8323 if (sign)
8324 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008325 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008326 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008327 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008328 *res++ = *pbuf++;
8329 *res++ = *pbuf++;
8330 }
8331 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008332 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 res += len;
8334 rescnt -= len;
8335 while (--width >= len) {
8336 --rescnt;
8337 *res++ = ' ';
8338 }
8339 if (dict && (argidx < arglen) && c != '%') {
8340 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008341 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008342 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 goto onError;
8344 }
8345 Py_XDECREF(temp);
8346 } /* '%' */
8347 } /* until end */
8348 if (argidx < arglen && !dict) {
8349 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008350 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 goto onError;
8352 }
8353
Thomas Woutersa96affe2006-03-12 00:29:36 +00008354 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8355 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 if (args_owned) {
8357 Py_DECREF(args);
8358 }
8359 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 return (PyObject *)result;
8361
8362 onError:
8363 Py_XDECREF(result);
8364 Py_DECREF(uformat);
8365 if (args_owned) {
8366 Py_DECREF(args);
8367 }
8368 return NULL;
8369}
8370
8371static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008372 (readbufferproc) unicode_buffer_getreadbuf,
8373 (writebufferproc) unicode_buffer_getwritebuf,
8374 (segcountproc) unicode_buffer_getsegcount,
8375 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376};
8377
Jeremy Hylton938ace62002-07-17 16:30:39 +00008378static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008379unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8380
Tim Peters6d6c1a32001-08-02 04:15:00 +00008381static PyObject *
8382unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8383{
8384 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008385 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008386 char *encoding = NULL;
8387 char *errors = NULL;
8388
Guido van Rossume023fe02001-08-30 03:12:59 +00008389 if (type != &PyUnicode_Type)
8390 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008391 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8392 kwlist, &x, &encoding, &errors))
8393 return NULL;
8394 if (x == NULL)
8395 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008396 if (encoding == NULL && errors == NULL)
8397 return PyObject_Unicode(x);
8398 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008399 return PyUnicode_FromEncodedObject(x, encoding, errors);
8400}
8401
Guido van Rossume023fe02001-08-30 03:12:59 +00008402static PyObject *
8403unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8404{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008405 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008406 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008407
8408 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8409 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8410 if (tmp == NULL)
8411 return NULL;
8412 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008413 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008414 if (pnew == NULL) {
8415 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008416 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008417 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008418 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8419 if (pnew->str == NULL) {
8420 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008421 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008422 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008423 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008424 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008425 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8426 pnew->length = n;
8427 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008428 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008429 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008430}
8431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008432PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008433"unicode(string [, encoding[, errors]]) -> object\n\
8434\n\
8435Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008436encoding defaults to the current default string encoding.\n\
8437errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008438
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008439static PyObject *unicode_iter(PyObject *seq);
8440
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441PyTypeObject PyUnicode_Type = {
8442 PyObject_HEAD_INIT(&PyType_Type)
8443 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008444 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 sizeof(PyUnicodeObject), /* tp_size */
8446 0, /* tp_itemsize */
8447 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008448 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008450 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008452 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008453 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008454 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008456 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 (hashfunc) unicode_hash, /* tp_hash*/
8458 0, /* tp_call*/
8459 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008460 PyObject_GenericGetAttr, /* tp_getattro */
8461 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008463 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8464 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008465 unicode_doc, /* tp_doc */
8466 0, /* tp_traverse */
8467 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008468 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008469 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008470 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008471 0, /* tp_iternext */
8472 unicode_methods, /* tp_methods */
8473 0, /* tp_members */
8474 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008475 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008476 0, /* tp_dict */
8477 0, /* tp_descr_get */
8478 0, /* tp_descr_set */
8479 0, /* tp_dictoffset */
8480 0, /* tp_init */
8481 0, /* tp_alloc */
8482 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008483 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484};
8485
8486/* Initialize the Unicode implementation */
8487
Thomas Wouters78890102000-07-22 19:25:51 +00008488void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008490 int i;
8491
Thomas Wouters477c8d52006-05-27 19:21:47 +00008492 /* XXX - move this array to unicodectype.c ? */
8493 Py_UNICODE linebreak[] = {
8494 0x000A, /* LINE FEED */
8495 0x000D, /* CARRIAGE RETURN */
8496 0x001C, /* FILE SEPARATOR */
8497 0x001D, /* GROUP SEPARATOR */
8498 0x001E, /* RECORD SEPARATOR */
8499 0x0085, /* NEXT LINE */
8500 0x2028, /* LINE SEPARATOR */
8501 0x2029, /* PARAGRAPH SEPARATOR */
8502 };
8503
Fred Drakee4315f52000-05-09 19:53:39 +00008504 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008505 unicode_freelist = NULL;
8506 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008508 if (!unicode_empty)
8509 return;
8510
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008511 for (i = 0; i < 256; i++)
8512 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008513 if (PyType_Ready(&PyUnicode_Type) < 0)
8514 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008515
8516 /* initialize the linebreak bloom filter */
8517 bloom_linebreak = make_bloom_mask(
8518 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8519 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008520
8521 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522}
8523
8524/* Finalize the Unicode implementation */
8525
8526void
Thomas Wouters78890102000-07-22 19:25:51 +00008527_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008529 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008530 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008532 Py_XDECREF(unicode_empty);
8533 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008534
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008535 for (i = 0; i < 256; i++) {
8536 if (unicode_latin1[i]) {
8537 Py_DECREF(unicode_latin1[i]);
8538 unicode_latin1[i] = NULL;
8539 }
8540 }
8541
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008542 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 PyUnicodeObject *v = u;
8544 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008545 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008546 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008547 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008548 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008550 unicode_freelist = NULL;
8551 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008553
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008554
8555
8556/********************* Unicode Iterator **************************/
8557
8558typedef struct {
8559 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008560 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008561 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8562} unicodeiterobject;
8563
8564static void
8565unicodeiter_dealloc(unicodeiterobject *it)
8566{
8567 _PyObject_GC_UNTRACK(it);
8568 Py_XDECREF(it->it_seq);
8569 PyObject_GC_Del(it);
8570}
8571
8572static int
8573unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8574{
8575 Py_VISIT(it->it_seq);
8576 return 0;
8577}
8578
8579static PyObject *
8580unicodeiter_next(unicodeiterobject *it)
8581{
8582 PyUnicodeObject *seq;
8583 PyObject *item;
8584
8585 assert(it != NULL);
8586 seq = it->it_seq;
8587 if (seq == NULL)
8588 return NULL;
8589 assert(PyUnicode_Check(seq));
8590
8591 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008592 item = PyUnicode_FromUnicode(
8593 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008594 if (item != NULL)
8595 ++it->it_index;
8596 return item;
8597 }
8598
8599 Py_DECREF(seq);
8600 it->it_seq = NULL;
8601 return NULL;
8602}
8603
8604static PyObject *
8605unicodeiter_len(unicodeiterobject *it)
8606{
8607 Py_ssize_t len = 0;
8608 if (it->it_seq)
8609 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8610 return PyInt_FromSsize_t(len);
8611}
8612
8613PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8614
8615static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008616 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8617 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008618 {NULL, NULL} /* sentinel */
8619};
8620
8621PyTypeObject PyUnicodeIter_Type = {
8622 PyObject_HEAD_INIT(&PyType_Type)
8623 0, /* ob_size */
8624 "unicodeiterator", /* tp_name */
8625 sizeof(unicodeiterobject), /* tp_basicsize */
8626 0, /* tp_itemsize */
8627 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008628 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008629 0, /* tp_print */
8630 0, /* tp_getattr */
8631 0, /* tp_setattr */
8632 0, /* tp_compare */
8633 0, /* tp_repr */
8634 0, /* tp_as_number */
8635 0, /* tp_as_sequence */
8636 0, /* tp_as_mapping */
8637 0, /* tp_hash */
8638 0, /* tp_call */
8639 0, /* tp_str */
8640 PyObject_GenericGetAttr, /* tp_getattro */
8641 0, /* tp_setattro */
8642 0, /* tp_as_buffer */
8643 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8644 0, /* tp_doc */
8645 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8646 0, /* tp_clear */
8647 0, /* tp_richcompare */
8648 0, /* tp_weaklistoffset */
8649 PyObject_SelfIter, /* tp_iter */
8650 (iternextfunc)unicodeiter_next, /* tp_iternext */
8651 unicodeiter_methods, /* tp_methods */
8652 0,
8653};
8654
8655static PyObject *
8656unicode_iter(PyObject *seq)
8657{
8658 unicodeiterobject *it;
8659
8660 if (!PyUnicode_Check(seq)) {
8661 PyErr_BadInternalCall();
8662 return NULL;
8663 }
8664 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8665 if (it == NULL)
8666 return NULL;
8667 it->it_index = 0;
8668 Py_INCREF(seq);
8669 it->it_seq = (PyUnicodeObject *)seq;
8670 _PyObject_GC_TRACK(it);
8671 return (PyObject *)it;
8672}
8673
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008674#ifdef __cplusplus
8675}
8676#endif
8677
8678
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008679/*
8680Local variables:
8681c-basic-offset: 4
8682indent-tabs-mode: nil
8683End:
8684*/