blob: 2fec095f1f46b1c0a82910a83e20d6cff20b199c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
/* Integer type that holds a bloom bitmask.  Characters are bucketed by
   their least significant 5 bits, so bits 0..31 are used. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL (not int 1) so that selecting bit 31 is
   well-defined, and mask is parenthesized so that an expression such as
   "a | b" can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter via the bloom mask, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Test the bloom mask first and only then do the exact membership scan.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper that casts the address of any object pointer
   variable to PyObject ** before calling PyUnicode_Resize().  Both
   arguments are parenthesized so arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldd2034312007-05-18 16:29:38 +0000396PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000397{
398 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000399 /* If the Unicode data is known at construction time, we can apply
400 some optimizations which share commonly used objects. */
401 if (u != NULL) {
402
403 /* Optimization for empty strings */
404 if (size == 0 && unicode_empty != NULL) {
405 Py_INCREF(unicode_empty);
406 return (PyObject *)unicode_empty;
407 }
408
Walter Dörwald071b9da2007-05-05 14:21:20 +0000409 /* Single characters are shared when using this constructor */
410 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000411 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000412 if (!unicode) {
413 unicode = _PyUnicode_New(1);
414 if (!unicode)
415 return NULL;
416 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 }
419 Py_INCREF(unicode);
420 return (PyObject *)unicode;
421 }
422 }
423
Walter Dörwald55507312007-05-18 13:12:10 +0000424 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000425 if (!unicode)
426 return NULL;
427
428 /* Copy the Unicode data into the new object */
429 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000430 Py_UNICODE *p = unicode->str;
431 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 ;
433 }
434
435 return (PyObject *)unicode;
436}
437
Walter Dörwaldd2034312007-05-18 16:29:38 +0000438PyObject *PyUnicode_FromString(const char *u)
439{
440 size_t size = strlen(u);
441 if (size > PY_SSIZE_T_MAX) {
442 PyErr_SetString(PyExc_OverflowError, "input too long");
443 return NULL;
444 }
445
446 return PyUnicode_FromStringAndSize(u, size);
447}
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_WCHAR_H
450
451PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000452 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453{
454 PyUnicodeObject *unicode;
455
456 if (w == NULL) {
457 PyErr_BadInternalCall();
458 return NULL;
459 }
460
461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the wchar_t data into the new object */
466#ifdef HAVE_USABLE_WCHAR_T
467 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000468#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 {
470 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000471 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000473 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 *u++ = *w++;
475 }
476#endif
477
478 return (PyObject *)unicode;
479}
480
Walter Dörwaldd2034312007-05-18 16:29:38 +0000481#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
482
483PyObject *
484PyUnicode_FromFormatV(const char *format, va_list vargs)
485{
486 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000487 Py_ssize_t callcount = 0;
488 PyObject **callresults = NULL;
489 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000490 Py_ssize_t n = 0;
491 const char* f;
492 Py_UNICODE *s;
493 PyObject *string;
494 /* used by sprintf */
495 char buffer[21];
496 const char *copy;
497
498#ifdef VA_LIST_IS_ARRAY
499 Py_MEMCPY(count, vargs, sizeof(va_list));
500#else
501#ifdef __va_copy
502 __va_copy(count, vargs);
503#else
504 count = vargs;
505#endif
506#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000507 /* step 1: count the number of %S/%R format specifications
508 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
509 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000510 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000511 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000512 ++callcount;
513 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000514 /* step 2: allocate memory for the results of
515 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000516 if (callcount) {
517 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
518 if (!callresults) {
519 PyErr_NoMemory();
520 return NULL;
521 }
522 callresult = callresults;
523 }
524 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525 for (f = format; *f; f++) {
526 if (*f == '%') {
527 const char* p = f;
528 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
529 ;
530
531 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
532 * they don't affect the amount of space we reserve.
533 */
534 if ((*f == 'l' || *f == 'z') &&
535 (f[1] == 'd' || f[1] == 'u'))
536 ++f;
537
538 switch (*f) {
539 case 'c':
540 (void)va_arg(count, int);
541 /* fall through... */
542 case '%':
543 n++;
544 break;
545 case 'd': case 'u': case 'i': case 'x':
546 (void) va_arg(count, int);
547 /* 20 bytes is enough to hold a 64-bit
548 integer. Decimal takes the most space.
549 This isn't enough for octal. */
550 n += 20;
551 break;
552 case 's':
553 n += strlen(va_arg(count, char*));
554 break;
555 case 'U':
556 {
557 PyObject *obj = va_arg(count, PyObject *);
558 assert(obj && PyUnicode_Check(obj));
559 n += PyUnicode_GET_SIZE(obj);
560 break;
561 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000562 case 'S':
563 {
564 PyObject *obj = va_arg(count, PyObject *);
565 PyObject *str;
566 assert(obj);
567 str = PyObject_Unicode(obj);
568 if (!str)
569 goto fail;
570 n += PyUnicode_GET_SIZE(str);
571 /* Remember the str and switch to the next slot */
572 *callresult++ = str;
573 break;
574 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 case 'R':
576 {
577 PyObject *obj = va_arg(count, PyObject *);
578 PyObject *repr;
579 assert(obj);
580 repr = PyObject_Repr(obj);
581 if (!repr)
582 goto fail;
583 n += PyUnicode_GET_SIZE(repr);
584 /* Remember the repr and switch to the next slot */
585 *callresult++ = repr;
586 break;
587 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 case 'p':
589 (void) va_arg(count, int);
590 /* maximum 64-bit pointer representation:
591 * 0xffffffffffffffff
592 * so 19 characters is enough.
593 * XXX I count 18 -- what's the extra for?
594 */
595 n += 19;
596 break;
597 default:
598 /* if we stumble upon an unknown
599 formatting code, copy the rest of
600 the format string to the output
601 string. (we cannot just skip the
602 code, since there's no way to know
603 what's in the argument list) */
604 n += strlen(p);
605 goto expand;
606 }
607 } else
608 n++;
609 }
610 expand:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000611 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000612 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000613 we don't have to resize the string.
614 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000615 string = PyUnicode_FromUnicode(NULL, n);
616 if (!string)
617 return NULL;
618
619 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000620 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000621
622 for (f = format; *f; f++) {
623 if (*f == '%') {
624 const char* p = f++;
625 int longflag = 0;
626 int size_tflag = 0;
627 /* parse the width.precision part (we're only
628 interested in the precision value, if any) */
629 n = 0;
630 while (isdigit(Py_CHARMASK(*f)))
631 n = (n*10) + *f++ - '0';
632 if (*f == '.') {
633 f++;
634 n = 0;
635 while (isdigit(Py_CHARMASK(*f)))
636 n = (n*10) + *f++ - '0';
637 }
638 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
639 f++;
640 /* handle the long flag, but only for %ld and %lu.
641 others can be added when necessary. */
642 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
643 longflag = 1;
644 ++f;
645 }
646 /* handle the size_t flag. */
647 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
648 size_tflag = 1;
649 ++f;
650 }
651
652 switch (*f) {
653 case 'c':
654 *s++ = va_arg(vargs, int);
655 break;
656 case 'd':
657 if (longflag)
658 sprintf(buffer, "%ld", va_arg(vargs, long));
659 else if (size_tflag)
660 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
661 va_arg(vargs, Py_ssize_t));
662 else
663 sprintf(buffer, "%d", va_arg(vargs, int));
664 appendstring(buffer);
665 break;
666 case 'u':
667 if (longflag)
668 sprintf(buffer, "%lu",
669 va_arg(vargs, unsigned long));
670 else if (size_tflag)
671 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
672 va_arg(vargs, size_t));
673 else
674 sprintf(buffer, "%u",
675 va_arg(vargs, unsigned int));
676 appendstring(buffer);
677 break;
678 case 'i':
679 sprintf(buffer, "%i", va_arg(vargs, int));
680 appendstring(buffer);
681 break;
682 case 'x':
683 sprintf(buffer, "%x", va_arg(vargs, int));
684 appendstring(buffer);
685 break;
686 case 's':
687 p = va_arg(vargs, char*);
688 appendstring(p);
689 break;
690 case 'U':
691 {
692 PyObject *obj = va_arg(vargs, PyObject *);
693 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
694 Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
695 Py_ssize_t upos;
696 for (upos = 0; upos<usize;)
697 *s++ = ucopy[upos++];
698 break;
699 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000700 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000701 case 'R':
702 {
703 /* unused, since we already have the result */
704 (void) va_arg(vargs, PyObject *);
705 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
706 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
707 Py_ssize_t upos;
708 for (upos = 0; upos<usize;)
709 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000710 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000712 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 ++callresult;
714 break;
715 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000716 case 'p':
717 sprintf(buffer, "%p", va_arg(vargs, void*));
718 /* %p is ill-defined: ensure leading 0x. */
719 if (buffer[1] == 'X')
720 buffer[1] = 'x';
721 else if (buffer[1] != 'x') {
722 memmove(buffer+2, buffer, strlen(buffer)+1);
723 buffer[0] = '0';
724 buffer[1] = 'x';
725 }
726 appendstring(buffer);
727 break;
728 case '%':
729 *s++ = '%';
730 break;
731 default:
732 appendstring(p);
733 goto end;
734 }
735 } else
736 *s++ = *f;
737 }
738
739 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 if (callresults)
741 PyMem_Free(callresults);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
743 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000744 fail:
745 if (callresults) {
746 PyObject **callresult2 = callresults;
747 while (callresult2 <= callresult) {
748 Py_DECREF(*callresult2);
749 ++callresult2;
750 }
751 PyMem_Free(callresults);
752 }
753 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754}
755
756#undef appendstring
757
758PyObject *
759PyUnicode_FromFormat(const char *format, ...)
760{
761 PyObject* ret;
762 va_list vargs;
763
764#ifdef HAVE_STDARG_PROTOTYPES
765 va_start(vargs, format);
766#else
767 va_start(vargs);
768#endif
769 ret = PyUnicode_FromFormatV(format, vargs);
770 va_end(vargs);
771 return ret;
772}
773
Martin v. Löwis18e16552006-02-15 17:27:45 +0000774Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
775 wchar_t *w,
776 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777{
778 if (unicode == NULL) {
779 PyErr_BadInternalCall();
780 return -1;
781 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000782
783 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000785 size = PyUnicode_GET_SIZE(unicode) + 1;
786
Guido van Rossumd57fd912000-03-10 22:53:23 +0000787#ifdef HAVE_USABLE_WCHAR_T
788 memcpy(w, unicode->str, size * sizeof(wchar_t));
789#else
790 {
791 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000792 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000794 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795 *w++ = *u++;
796 }
797#endif
798
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000799 if (size > PyUnicode_GET_SIZE(unicode))
800 return PyUnicode_GET_SIZE(unicode);
801 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 return size;
803}
804
805#endif
806
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000807PyObject *PyUnicode_FromOrdinal(int ordinal)
808{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000809 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000810
811#ifdef Py_UNICODE_WIDE
812 if (ordinal < 0 || ordinal > 0x10ffff) {
813 PyErr_SetString(PyExc_ValueError,
814 "unichr() arg not in range(0x110000) "
815 "(wide Python build)");
816 return NULL;
817 }
818#else
819 if (ordinal < 0 || ordinal > 0xffff) {
820 PyErr_SetString(PyExc_ValueError,
821 "unichr() arg not in range(0x10000) "
822 "(narrow Python build)");
823 return NULL;
824 }
825#endif
826
Hye-Shik Chang40574832004-04-06 07:24:51 +0000827 s[0] = (Py_UNICODE)ordinal;
828 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000829}
830
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831PyObject *PyUnicode_FromObject(register PyObject *obj)
832{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000833 /* XXX Perhaps we should make this API an alias of
834 PyObject_Unicode() instead ?! */
835 if (PyUnicode_CheckExact(obj)) {
836 Py_INCREF(obj);
837 return obj;
838 }
839 if (PyUnicode_Check(obj)) {
840 /* For a Unicode subtype that's not a Unicode object,
841 return a true Unicode object with the same data. */
842 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
843 PyUnicode_GET_SIZE(obj));
844 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000845 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
846}
847
848PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
849 const char *encoding,
850 const char *errors)
851{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000852 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000853 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000854 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000855
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856 if (obj == NULL) {
857 PyErr_BadInternalCall();
858 return NULL;
859 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000860
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000861#if 0
862 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000863 that no encodings is given and then redirect to
864 PyObject_Unicode() which then applies the additional logic for
865 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000866
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000867 NOTE: This API should really only be used for object which
868 represent *encoded* Unicode !
869
870 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000871 if (PyUnicode_Check(obj)) {
872 if (encoding) {
873 PyErr_SetString(PyExc_TypeError,
874 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000875 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000876 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000877 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000878 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000879#else
880 if (PyUnicode_Check(obj)) {
881 PyErr_SetString(PyExc_TypeError,
882 "decoding Unicode is not supported");
883 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000884 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000885#endif
886
887 /* Coerce object */
888 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000889 s = PyString_AS_STRING(obj);
890 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000891 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000892 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
893 /* Overwrite the error message with something more useful in
894 case of a TypeError. */
895 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000896 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000897 "coercing to Unicode: need string or buffer, "
898 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000899 obj->ob_type->tp_name);
900 goto onError;
901 }
Tim Petersced69f82003-09-16 20:30:58 +0000902
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000903 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 if (len == 0) {
905 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000906 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 }
Tim Petersced69f82003-09-16 20:30:58 +0000908 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000909 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000910
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000911 return v;
912
913 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000915}
916
917PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000918 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000919 const char *encoding,
920 const char *errors)
921{
922 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000923
924 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000925 encoding = PyUnicode_GetDefaultEncoding();
926
927 /* Shortcuts for common default encodings */
928 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000929 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000930 else if (strcmp(encoding, "latin-1") == 0)
931 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000932#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
933 else if (strcmp(encoding, "mbcs") == 0)
934 return PyUnicode_DecodeMBCS(s, size, errors);
935#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000936 else if (strcmp(encoding, "ascii") == 0)
937 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938
939 /* Decode via the codec registry */
940 buffer = PyBuffer_FromMemory((void *)s, size);
941 if (buffer == NULL)
942 goto onError;
943 unicode = PyCodec_Decode(buffer, encoding, errors);
944 if (unicode == NULL)
945 goto onError;
946 if (!PyUnicode_Check(unicode)) {
947 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000948 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 unicode->ob_type->tp_name);
950 Py_DECREF(unicode);
951 goto onError;
952 }
953 Py_DECREF(buffer);
954 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000955
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956 onError:
957 Py_XDECREF(buffer);
958 return NULL;
959}
960
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000961PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
962 const char *encoding,
963 const char *errors)
964{
965 PyObject *v;
966
967 if (!PyUnicode_Check(unicode)) {
968 PyErr_BadArgument();
969 goto onError;
970 }
971
972 if (encoding == NULL)
973 encoding = PyUnicode_GetDefaultEncoding();
974
975 /* Decode via the codec registry */
976 v = PyCodec_Decode(unicode, encoding, errors);
977 if (v == NULL)
978 goto onError;
979 return v;
980
981 onError:
982 return NULL;
983}
984
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000986 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 const char *encoding,
988 const char *errors)
989{
990 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000991
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 unicode = PyUnicode_FromUnicode(s, size);
993 if (unicode == NULL)
994 return NULL;
995 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
996 Py_DECREF(unicode);
997 return v;
998}
999
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001000PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1001 const char *encoding,
1002 const char *errors)
1003{
1004 PyObject *v;
1005
1006 if (!PyUnicode_Check(unicode)) {
1007 PyErr_BadArgument();
1008 goto onError;
1009 }
1010
1011 if (encoding == NULL)
1012 encoding = PyUnicode_GetDefaultEncoding();
1013
1014 /* Encode via the codec registry */
1015 v = PyCodec_Encode(unicode, encoding, errors);
1016 if (v == NULL)
1017 goto onError;
1018 return v;
1019
1020 onError:
1021 return NULL;
1022}
1023
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1025 const char *encoding,
1026 const char *errors)
1027{
1028 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001029
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 if (!PyUnicode_Check(unicode)) {
1031 PyErr_BadArgument();
1032 goto onError;
1033 }
Fred Drakee4315f52000-05-09 19:53:39 +00001034
Tim Petersced69f82003-09-16 20:30:58 +00001035 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001036 encoding = PyUnicode_GetDefaultEncoding();
1037
1038 /* Shortcuts for common default encodings */
1039 if (errors == NULL) {
1040 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001041 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001042 else if (strcmp(encoding, "latin-1") == 0)
1043 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001044#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1045 else if (strcmp(encoding, "mbcs") == 0)
1046 return PyUnicode_AsMBCSString(unicode);
1047#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001048 else if (strcmp(encoding, "ascii") == 0)
1049 return PyUnicode_AsASCIIString(unicode);
1050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051
1052 /* Encode via the codec registry */
1053 v = PyCodec_Encode(unicode, encoding, errors);
1054 if (v == NULL)
1055 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001056 if (!PyBytes_Check(v)) {
1057 if (PyString_Check(v)) {
1058 /* Old codec, turn it into bytes */
1059 PyObject *b = PyBytes_FromObject(v);
1060 Py_DECREF(v);
1061 return b;
1062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001064 "encoder did not return a bytes object "
1065 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1066 v->ob_type->tp_name,
1067 encoding ? encoding : "NULL",
1068 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069 Py_DECREF(v);
1070 goto onError;
1071 }
1072 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001073
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 onError:
1075 return NULL;
1076}
1077
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001078PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1079 const char *errors)
1080{
1081 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001082 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001083 if (v)
1084 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001085 if (errors != NULL)
1086 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1087 if (errors == NULL) {
1088 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1089 PyUnicode_GET_SIZE(unicode),
1090 NULL);
1091 }
1092 else {
1093 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1094 }
1095 if (!b)
1096 return NULL;
1097 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1098 PyBytes_Size(b));
1099 Py_DECREF(b);
1100 if (!errors) {
1101 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001102 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001103 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001104 return v;
1105}
1106
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1108{
1109 if (!PyUnicode_Check(unicode)) {
1110 PyErr_BadArgument();
1111 goto onError;
1112 }
1113 return PyUnicode_AS_UNICODE(unicode);
1114
1115 onError:
1116 return NULL;
1117}
1118
Martin v. Löwis18e16552006-02-15 17:27:45 +00001119Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120{
1121 if (!PyUnicode_Check(unicode)) {
1122 PyErr_BadArgument();
1123 goto onError;
1124 }
1125 return PyUnicode_GET_SIZE(unicode);
1126
1127 onError:
1128 return -1;
1129}
1130
Thomas Wouters78890102000-07-22 19:25:51 +00001131const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001132{
1133 return unicode_default_encoding;
1134}
1135
1136int PyUnicode_SetDefaultEncoding(const char *encoding)
1137{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001138 if (strcmp(encoding, unicode_default_encoding) != 0) {
1139 PyErr_Format(PyExc_ValueError,
1140 "Can only set default encoding to %s",
1141 unicode_default_encoding);
1142 return -1;
1143 }
Fred Drakee4315f52000-05-09 19:53:39 +00001144 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001145}
1146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001147/* error handling callback helper:
1148 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001149 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001150 and adjust various state variables.
1151 return 0 on success, -1 on error
1152*/
1153
1154static
1155int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1156 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1158 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001160 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161
1162 PyObject *restuple = NULL;
1163 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001164 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1165 Py_ssize_t requiredsize;
1166 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001168 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001169 int res = -1;
1170
1171 if (*errorHandler == NULL) {
1172 *errorHandler = PyCodec_LookupError(errors);
1173 if (*errorHandler == NULL)
1174 goto onError;
1175 }
1176
1177 if (*exceptionObject == NULL) {
1178 *exceptionObject = PyUnicodeDecodeError_Create(
1179 encoding, input, insize, *startinpos, *endinpos, reason);
1180 if (*exceptionObject == NULL)
1181 goto onError;
1182 }
1183 else {
1184 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1185 goto onError;
1186 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1187 goto onError;
1188 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1189 goto onError;
1190 }
1191
1192 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1193 if (restuple == NULL)
1194 goto onError;
1195 if (!PyTuple_Check(restuple)) {
1196 PyErr_Format(PyExc_TypeError, &argparse[4]);
1197 goto onError;
1198 }
1199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1200 goto onError;
1201 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001202 newpos = insize+newpos;
1203 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001204 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001205 goto onError;
1206 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001207
1208 /* need more space? (at least enough for what we
1209 have+the replacement+the rest of the string (starting
1210 at the new input position), so we won't have to check space
1211 when there are no errors in the rest of the string) */
1212 repptr = PyUnicode_AS_UNICODE(repunicode);
1213 repsize = PyUnicode_GET_SIZE(repunicode);
1214 requiredsize = *outpos + repsize + insize-newpos;
1215 if (requiredsize > outsize) {
1216 if (requiredsize<2*outsize)
1217 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001218 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 goto onError;
1220 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1221 }
1222 *endinpos = newpos;
1223 *inptr = input + newpos;
1224 Py_UNICODE_COPY(*outptr, repptr, repsize);
1225 *outptr += repsize;
1226 *outpos += repsize;
1227 /* we made it! */
1228 res = 0;
1229
1230 onError:
1231 Py_XDECREF(restuple);
1232 return res;
1233}
1234
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001235/* --- UTF-7 Codec -------------------------------------------------------- */
1236
1237/* see RFC2152 for details */
1238
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   The table is only ever read, hence const.

   Indicates whether a UTF-7 character is special, i.e. cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1257
/* SPECIAL(c, encodeO, encodeWS): true when character c must be base64
   encoded in UTF-7, taking the two optional classes of utf7_special
   into account.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Written with
   explicit ASCII ranges rather than isalnum(): callers pass Py_UNICODE
   values that may lie far outside the range isalnum() is defined for,
   and isalnum() is locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits through out for as long as
   the bit accumulator ch holds at least 6 pending bits; bits is updated
   to the number of bits left over. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)

/* DECODE(out, ch, bits, surrogate): emit 16-bit units through out for as
   long as the bit accumulator ch holds at least 16 pending bits.  Relies
   on an errmsg variable and a utf7Error label in the expanding function.
   NOTE(review): 0xDC00..0xDFFF is the trailing (low) surrogate range; a
   leading surrogate 0xD800..0xDBFF is not caught by this test -- confirm
   whether that is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high */   \
                /* surrogate so let's not bother seeing if the low */   \
                /* surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't */  \
                /* represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001300
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001301PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001302 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001303 const char *errors)
1304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001305 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001306 Py_ssize_t startinpos;
1307 Py_ssize_t endinpos;
1308 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001309 const char *e;
1310 PyUnicodeObject *unicode;
1311 Py_UNICODE *p;
1312 const char *errmsg = "";
1313 int inShift = 0;
1314 unsigned int bitsleft = 0;
1315 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001316 int surrogate = 0;
1317 PyObject *errorHandler = NULL;
1318 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001319
1320 unicode = _PyUnicode_New(size);
1321 if (!unicode)
1322 return NULL;
1323 if (size == 0)
1324 return (PyObject *)unicode;
1325
1326 p = unicode->str;
1327 e = s + size;
1328
1329 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 Py_UNICODE ch;
1331 restart:
1332 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001333
1334 if (inShift) {
1335 if ((ch == '-') || !B64CHAR(ch)) {
1336 inShift = 0;
1337 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001338
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001339 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1340 if (bitsleft >= 6) {
1341 /* The shift sequence has a partial character in it. If
1342 bitsleft < 6 then we could just classify it as padding
1343 but that is not the case here */
1344
1345 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001346 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001347 }
1348 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001349 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001350 here so indicate the potential of a misencoded character. */
1351
1352 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1353 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1354 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001355 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001356 }
1357
1358 if (ch == '-') {
1359 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001360 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001361 inShift = 1;
1362 }
1363 } else if (SPECIAL(ch,0,0)) {
1364 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001365 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001366 } else {
1367 *p++ = ch;
1368 }
1369 } else {
1370 charsleft = (charsleft << 6) | UB64(ch);
1371 bitsleft += 6;
1372 s++;
1373 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1374 }
1375 }
1376 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001378 s++;
1379 if (s < e && *s == '-') {
1380 s++;
1381 *p++ = '+';
1382 } else
1383 {
1384 inShift = 1;
1385 bitsleft = 0;
1386 }
1387 }
1388 else if (SPECIAL(ch,0,0)) {
1389 errmsg = "unexpected special character";
1390 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001391 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001392 }
1393 else {
1394 *p++ = ch;
1395 s++;
1396 }
1397 continue;
1398 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 outpos = p-PyUnicode_AS_UNICODE(unicode);
1400 endinpos = s-starts;
1401 if (unicode_decode_call_errorhandler(
1402 errors, &errorHandler,
1403 "utf7", errmsg,
1404 starts, size, &startinpos, &endinpos, &exc, &s,
1405 (PyObject **)&unicode, &outpos, &p))
1406 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001407 }
1408
1409 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 outpos = p-PyUnicode_AS_UNICODE(unicode);
1411 endinpos = size;
1412 if (unicode_decode_call_errorhandler(
1413 errors, &errorHandler,
1414 "utf7", "unterminated shift sequence",
1415 starts, size, &startinpos, &endinpos, &exc, &s,
1416 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001418 if (s < e)
1419 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 }
1421
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001422 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001423 goto onError;
1424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 Py_XDECREF(errorHandler);
1426 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427 return (PyObject *)unicode;
1428
1429onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001430 Py_XDECREF(errorHandler);
1431 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001432 Py_DECREF(unicode);
1433 return NULL;
1434}
1435
1436
1437PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001438 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001439 int encodeSetO,
1440 int encodeWhiteSpace,
1441 const char *errors)
1442{
1443 PyObject *v;
1444 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001445 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001446 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001447 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448 unsigned int bitsleft = 0;
1449 unsigned long charsleft = 0;
1450 char * out;
1451 char * start;
1452
1453 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001454 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001455
Walter Dörwald51ab4142007-05-05 14:43:36 +00001456 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001457 if (v == NULL)
1458 return NULL;
1459
Walter Dörwald51ab4142007-05-05 14:43:36 +00001460 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001461 for (;i < size; ++i) {
1462 Py_UNICODE ch = s[i];
1463
1464 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001465 if (ch == '+') {
1466 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 *out++ = '-';
1468 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1469 charsleft = ch;
1470 bitsleft = 16;
1471 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001472 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001474 } else {
1475 *out++ = (char) ch;
1476 }
1477 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1479 *out++ = B64(charsleft << (6-bitsleft));
1480 charsleft = 0;
1481 bitsleft = 0;
1482 /* Characters not in the BASE64 set implicitly unshift the sequence
1483 so no '-' is required, except if the character is itself a '-' */
1484 if (B64CHAR(ch) || ch == '-') {
1485 *out++ = '-';
1486 }
1487 inShift = 0;
1488 *out++ = (char) ch;
1489 } else {
1490 bitsleft += 16;
1491 charsleft = (charsleft << 16) | ch;
1492 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1493
1494 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001495 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001496 or '-' then the shift sequence will be terminated implicitly and we
1497 don't have to insert a '-'. */
1498
1499 if (bitsleft == 0) {
1500 if (i + 1 < size) {
1501 Py_UNICODE ch2 = s[i+1];
1502
1503 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001504
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505 } else if (B64CHAR(ch2) || ch2 == '-') {
1506 *out++ = '-';
1507 inShift = 0;
1508 } else {
1509 inShift = 0;
1510 }
1511
1512 }
1513 else {
1514 *out++ = '-';
1515 inShift = 0;
1516 }
1517 }
Tim Petersced69f82003-09-16 20:30:58 +00001518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521 if (bitsleft) {
1522 *out++= B64(charsleft << (6-bitsleft) );
1523 *out++ = '-';
1524 }
1525
Walter Dörwald51ab4142007-05-05 14:43:36 +00001526 if (PyBytes_Resize(v, out - start)) {
1527 Py_DECREF(v);
1528 return NULL;
1529 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530 return v;
1531}
1532
1533#undef SPECIAL
1534#undef B64
1535#undef B64CHAR
1536#undef UB64
1537#undef ENCODE
1538#undef DECODE
1539
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540/* --- UTF-8 Codec -------------------------------------------------------- */
1541
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001565 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 const char *errors)
1567{
Walter Dörwald69652032004-09-07 20:24:22 +00001568 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1569}
1570
1571PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001572 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001573 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001574 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001584 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 PyObject *errorHandler = NULL;
1586 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
1588 /* Note: size will always be longer than the resulting Unicode
1589 character count */
1590 unicode = _PyUnicode_New(size);
1591 if (!unicode)
1592 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001593 if (size == 0) {
1594 if (consumed)
1595 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598
1599 /* Unpack UTF-8 encoded data */
1600 p = unicode->str;
1601 e = s + size;
1602
1603 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001604 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605
1606 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001607 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 s++;
1609 continue;
1610 }
1611
1612 n = utf8_code_length[ch];
1613
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001614 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001615 if (consumed)
1616 break;
1617 else {
1618 errmsg = "unexpected end of data";
1619 startinpos = s-starts;
1620 endinpos = size;
1621 goto utf8Error;
1622 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624
1625 switch (n) {
1626
1627 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001628 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = s-starts;
1630 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632
1633 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001634 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 startinpos = s-starts;
1636 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001637 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638
1639 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001640 if ((s[1] & 0xc0) != 0x80) {
1641 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = s-starts;
1643 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001644 goto utf8Error;
1645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001647 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 startinpos = s-starts;
1649 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001650 errmsg = "illegal encoding";
1651 goto utf8Error;
1652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001654 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 break;
1656
1657 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001658 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001659 (s[2] & 0xc0) != 0x80) {
1660 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661 startinpos = s-starts;
1662 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001663 goto utf8Error;
1664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001666 if (ch < 0x0800) {
1667 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001668 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001669
1670 XXX For wide builds (UCS-4) we should probably try
1671 to recombine the surrogates into a single code
1672 unit.
1673 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001674 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001675 startinpos = s-starts;
1676 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001677 goto utf8Error;
1678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001680 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001681 break;
1682
1683 case 4:
1684 if ((s[1] & 0xc0) != 0x80 ||
1685 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 (s[3] & 0xc0) != 0x80) {
1687 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = s-starts;
1689 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf8Error;
1691 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001692 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1693 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1694 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001695 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001696 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001698 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001700 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001701 startinpos = s-starts;
1702 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001703 goto utf8Error;
1704 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001705#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001706 *p++ = (Py_UNICODE)ch;
1707#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001708 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001709
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001710 /* translate from 10000..10FFFF to 0..FFFF */
1711 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001712
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001713 /* high surrogate = top 10 bits added to D800 */
1714 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001715
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001716 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001717 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001718#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 break;
1720
1721 default:
1722 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001723 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 startinpos = s-starts;
1725 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001726 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 }
1728 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001729 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001730
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001731 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 outpos = p-PyUnicode_AS_UNICODE(unicode);
1733 if (unicode_decode_call_errorhandler(
1734 errors, &errorHandler,
1735 "utf8", errmsg,
1736 starts, size, &startinpos, &endinpos, &exc, &s,
1737 (PyObject **)&unicode, &outpos, &p))
1738 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 }
Walter Dörwald69652032004-09-07 20:24:22 +00001740 if (consumed)
1741 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742
1743 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001744 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 goto onError;
1746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 Py_XDECREF(errorHandler);
1748 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 return (PyObject *)unicode;
1750
1751onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 Py_XDECREF(errorHandler);
1753 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 Py_DECREF(unicode);
1755 return NULL;
1756}
1757
Tim Peters602f7402002-04-27 18:03:26 +00001758/* Allocation strategy: if the string is short, convert into a stack buffer
1759 and allocate exactly as much space needed at the end. Else allocate the
1760 maximum possible needed (4 result bytes per Unicode character), and return
1761 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001762*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001763PyObject *
1764PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767{
Tim Peters602f7402002-04-27 18:03:26 +00001768#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001769
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001771 PyObject *v; /* result string object */
1772 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001773 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001774 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001775 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001776
Tim Peters602f7402002-04-27 18:03:26 +00001777 assert(s != NULL);
1778 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779
Tim Peters602f7402002-04-27 18:03:26 +00001780 if (size <= MAX_SHORT_UNICHARS) {
1781 /* Write into the stack buffer; nallocated can't overflow.
1782 * At the end, we'll allocate exactly as much heap space as it
1783 * turns out we need.
1784 */
1785 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1786 v = NULL; /* will allocate after we're done */
1787 p = stackbuf;
1788 }
1789 else {
1790 /* Overallocate on the heap, and give the excess back at the end. */
1791 nallocated = size * 4;
1792 if (nallocated / 4 != size) /* overflow! */
1793 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001794 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001795 if (v == NULL)
1796 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001797 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001798 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001799
Tim Peters602f7402002-04-27 18:03:26 +00001800 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001801 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001802
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001803 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001804 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001808 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001809 *p++ = (char)(0xc0 | (ch >> 6));
1810 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001811 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001812 else {
Tim Peters602f7402002-04-27 18:03:26 +00001813 /* Encode UCS2 Unicode ordinals */
1814 if (ch < 0x10000) {
1815 /* Special case: check for high surrogate */
1816 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1817 Py_UCS4 ch2 = s[i];
1818 /* Check for low surrogate and combine the two to
1819 form a UCS4 value */
1820 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001821 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001822 i++;
1823 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001824 }
Tim Peters602f7402002-04-27 18:03:26 +00001825 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001826 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001827 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001828 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1829 *p++ = (char)(0x80 | (ch & 0x3f));
1830 continue;
1831 }
1832encodeUCS4:
1833 /* Encode UCS4 Unicode ordinals */
1834 *p++ = (char)(0xf0 | (ch >> 18));
1835 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1836 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1837 *p++ = (char)(0x80 | (ch & 0x3f));
1838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001840
Tim Peters602f7402002-04-27 18:03:26 +00001841 if (v == NULL) {
1842 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001843 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001844 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001845 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001846 }
1847 else {
1848 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001849 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001850 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001851 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001854
Tim Peters602f7402002-04-27 18:03:26 +00001855#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1859{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 if (!PyUnicode_Check(unicode)) {
1861 PyErr_BadArgument();
1862 return NULL;
1863 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001864 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1865 PyUnicode_GET_SIZE(unicode),
1866 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867}
1868
1869/* --- UTF-16 Codec ------------------------------------------------------- */
1870
Tim Peters772747b2001-08-09 22:21:55 +00001871PyObject *
1872PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001873 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001874 const char *errors,
1875 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876{
Walter Dörwald69652032004-09-07 20:24:22 +00001877 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1878}
1879
1880PyObject *
1881PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001882 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001883 const char *errors,
1884 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001885 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001888 Py_ssize_t startinpos;
1889 Py_ssize_t endinpos;
1890 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 PyUnicodeObject *unicode;
1892 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001893 const unsigned char *q, *e;
1894 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001895 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001896 /* Offsets from q for retrieving byte pairs in the right order. */
1897#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1898 int ihi = 1, ilo = 0;
1899#else
1900 int ihi = 0, ilo = 1;
1901#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 PyObject *errorHandler = NULL;
1903 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904
1905 /* Note: size will always be longer than the resulting Unicode
1906 character count */
1907 unicode = _PyUnicode_New(size);
1908 if (!unicode)
1909 return NULL;
1910 if (size == 0)
1911 return (PyObject *)unicode;
1912
1913 /* Unpack UTF-16 encoded data */
1914 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001915 q = (unsigned char *)s;
1916 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917
1918 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001919 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001921 /* Check for BOM marks (U+FEFF) in the input and adjust current
1922 byte order setting accordingly. In native mode, the leading BOM
1923 mark is skipped, in all other modes, it is copied to the output
1924 stream as-is (giving a ZWNBSP character). */
1925 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001926 if (size >= 2) {
1927 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001928#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001929 if (bom == 0xFEFF) {
1930 q += 2;
1931 bo = -1;
1932 }
1933 else if (bom == 0xFFFE) {
1934 q += 2;
1935 bo = 1;
1936 }
Tim Petersced69f82003-09-16 20:30:58 +00001937#else
Walter Dörwald69652032004-09-07 20:24:22 +00001938 if (bom == 0xFEFF) {
1939 q += 2;
1940 bo = 1;
1941 }
1942 else if (bom == 0xFFFE) {
1943 q += 2;
1944 bo = -1;
1945 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001946#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001947 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949
Tim Peters772747b2001-08-09 22:21:55 +00001950 if (bo == -1) {
1951 /* force LE */
1952 ihi = 1;
1953 ilo = 0;
1954 }
1955 else if (bo == 1) {
1956 /* force BE */
1957 ihi = 0;
1958 ilo = 1;
1959 }
1960
1961 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001963 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001964 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001965 if (consumed)
1966 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 errmsg = "truncated data";
1968 startinpos = ((const char *)q)-starts;
1969 endinpos = ((const char *)e)-starts;
1970 goto utf16Error;
1971 /* The remaining input chars are ignored if the callback
1972 chooses to skip the input */
1973 }
1974 ch = (q[ihi] << 8) | q[ilo];
1975
Tim Peters772747b2001-08-09 22:21:55 +00001976 q += 2;
1977
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 if (ch < 0xD800 || ch > 0xDFFF) {
1979 *p++ = ch;
1980 continue;
1981 }
1982
1983 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001984 if (q >= e) {
1985 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 startinpos = (((const char *)q)-2)-starts;
1987 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001988 goto utf16Error;
1989 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001990 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001991 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1992 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001993 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001994#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001995 *p++ = ch;
1996 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001997#else
1998 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001999#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002000 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 }
2002 else {
2003 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 startinpos = (((const char *)q)-4)-starts;
2005 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 goto utf16Error;
2007 }
2008
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002010 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 startinpos = (((const char *)q)-2)-starts;
2012 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002013 /* Fall through to report the error */
2014
2015 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 outpos = p-PyUnicode_AS_UNICODE(unicode);
2017 if (unicode_decode_call_errorhandler(
2018 errors, &errorHandler,
2019 "utf16", errmsg,
2020 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2021 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002022 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 }
2024
2025 if (byteorder)
2026 *byteorder = bo;
2027
Walter Dörwald69652032004-09-07 20:24:22 +00002028 if (consumed)
2029 *consumed = (const char *)q-starts;
2030
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002032 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 goto onError;
2034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_XDECREF(errorHandler);
2036 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 return (PyObject *)unicode;
2038
2039onError:
2040 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041 Py_XDECREF(errorHandler);
2042 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 return NULL;
2044}
2045
Tim Peters772747b2001-08-09 22:21:55 +00002046PyObject *
2047PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002049 const char *errors,
2050 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051{
2052 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002053 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002054#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002055 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002056#else
2057 const int pairs = 0;
2058#endif
Tim Peters772747b2001-08-09 22:21:55 +00002059 /* Offsets from p for storing byte pairs in the right order. */
2060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2061 int ihi = 1, ilo = 0;
2062#else
2063 int ihi = 0, ilo = 1;
2064#endif
2065
2066#define STORECHAR(CH) \
2067 do { \
2068 p[ihi] = ((CH) >> 8) & 0xff; \
2069 p[ilo] = (CH) & 0xff; \
2070 p += 2; \
2071 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002073#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002074 for (i = pairs = 0; i < size; i++)
2075 if (s[i] >= 0x10000)
2076 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002077#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002078 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002079 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 if (v == NULL)
2081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Walter Dörwald3cc34522007-05-04 10:48:27 +00002083 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002085 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002086 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002087 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002088
2089 if (byteorder == -1) {
2090 /* force LE */
2091 ihi = 1;
2092 ilo = 0;
2093 }
2094 else if (byteorder == 1) {
2095 /* force BE */
2096 ihi = 0;
2097 ilo = 1;
2098 }
2099
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002100 while (size-- > 0) {
2101 Py_UNICODE ch = *s++;
2102 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002103#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002105 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2106 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002108#endif
Tim Peters772747b2001-08-09 22:21:55 +00002109 STORECHAR(ch);
2110 if (ch2)
2111 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002114#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115}
2116
2117PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2118{
2119 if (!PyUnicode_Check(unicode)) {
2120 PyErr_BadArgument();
2121 return NULL;
2122 }
2123 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2124 PyUnicode_GET_SIZE(unicode),
2125 NULL,
2126 0);
2127}
2128
2129/* --- Unicode Escape Codec ----------------------------------------------- */
2130
Fredrik Lundh06d12682001-01-24 07:59:11 +00002131static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002132
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002134 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 const char *errors)
2136{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002138 Py_ssize_t startinpos;
2139 Py_ssize_t endinpos;
2140 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002141 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002145 char* message;
2146 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 PyObject *errorHandler = NULL;
2148 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002149
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 /* Escaped strings will always be longer than the resulting
2151 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 length after conversion to the true value.
2153 (but if the error callback returns a long replacement string
2154 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 v = _PyUnicode_New(size);
2156 if (v == NULL)
2157 goto onError;
2158 if (size == 0)
2159 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002161 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002163
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 while (s < end) {
2165 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002166 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002167 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168
2169 /* Non-escape characters are interpreted as Unicode ordinals */
2170 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002171 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 continue;
2173 }
2174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 /* \ - Escapes */
2177 s++;
2178 switch (*s++) {
2179
2180 /* \x escapes */
2181 case '\n': break;
2182 case '\\': *p++ = '\\'; break;
2183 case '\'': *p++ = '\''; break;
2184 case '\"': *p++ = '\"'; break;
2185 case 'b': *p++ = '\b'; break;
2186 case 'f': *p++ = '\014'; break; /* FF */
2187 case 't': *p++ = '\t'; break;
2188 case 'n': *p++ = '\n'; break;
2189 case 'r': *p++ = '\r'; break;
2190 case 'v': *p++ = '\013'; break; /* VT */
2191 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2192
2193 /* \OOO (octal) escapes */
2194 case '0': case '1': case '2': case '3':
2195 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002196 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002198 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002200 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002202 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 break;
2204
Fredrik Lundhccc74732001-02-18 22:13:49 +00002205 /* hex escapes */
2206 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002208 digits = 2;
2209 message = "truncated \\xXX escape";
2210 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Fredrik Lundhccc74732001-02-18 22:13:49 +00002212 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002214 digits = 4;
2215 message = "truncated \\uXXXX escape";
2216 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217
Fredrik Lundhccc74732001-02-18 22:13:49 +00002218 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002219 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002220 digits = 8;
2221 message = "truncated \\UXXXXXXXX escape";
2222 hexescape:
2223 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 outpos = p-PyUnicode_AS_UNICODE(v);
2225 if (s+digits>end) {
2226 endinpos = size;
2227 if (unicode_decode_call_errorhandler(
2228 errors, &errorHandler,
2229 "unicodeescape", "end of string in escape sequence",
2230 starts, size, &startinpos, &endinpos, &exc, &s,
2231 (PyObject **)&v, &outpos, &p))
2232 goto onError;
2233 goto nextByte;
2234 }
2235 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002236 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002237 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 endinpos = (s+i+1)-starts;
2239 if (unicode_decode_call_errorhandler(
2240 errors, &errorHandler,
2241 "unicodeescape", message,
2242 starts, size, &startinpos, &endinpos, &exc, &s,
2243 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002245 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002246 }
2247 chr = (chr<<4) & ~0xF;
2248 if (c >= '0' && c <= '9')
2249 chr += c - '0';
2250 else if (c >= 'a' && c <= 'f')
2251 chr += 10 + c - 'a';
2252 else
2253 chr += 10 + c - 'A';
2254 }
2255 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002256 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 /* _decoding_error will have already written into the
2258 target buffer. */
2259 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002260 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002261 /* when we get here, chr is a 32-bit unicode character */
2262 if (chr <= 0xffff)
2263 /* UCS-2 character */
2264 *p++ = (Py_UNICODE) chr;
2265 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002266 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002267 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002268#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002269 *p++ = chr;
2270#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002271 chr -= 0x10000L;
2272 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002273 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002274#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002275 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 endinpos = s-starts;
2277 outpos = p-PyUnicode_AS_UNICODE(v);
2278 if (unicode_decode_call_errorhandler(
2279 errors, &errorHandler,
2280 "unicodeescape", "illegal Unicode character",
2281 starts, size, &startinpos, &endinpos, &exc, &s,
2282 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002283 goto onError;
2284 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002285 break;
2286
2287 /* \N{name} */
2288 case 'N':
2289 message = "malformed \\N character escape";
2290 if (ucnhash_CAPI == NULL) {
2291 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002292 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002293 m = PyImport_ImportModule("unicodedata");
2294 if (m == NULL)
2295 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002296 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002297 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002298 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002299 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002300 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002301 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002302 if (ucnhash_CAPI == NULL)
2303 goto ucnhashError;
2304 }
2305 if (*s == '{') {
2306 const char *start = s+1;
2307 /* look for the closing brace */
2308 while (*s != '}' && s < end)
2309 s++;
2310 if (s > start && s < end && *s == '}') {
2311 /* found a name. look it up in the unicode database */
2312 message = "unknown Unicode character name";
2313 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002314 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002315 goto store;
2316 }
2317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002318 endinpos = s-starts;
2319 outpos = p-PyUnicode_AS_UNICODE(v);
2320 if (unicode_decode_call_errorhandler(
2321 errors, &errorHandler,
2322 "unicodeescape", message,
2323 starts, size, &startinpos, &endinpos, &exc, &s,
2324 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002325 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002326 break;
2327
2328 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002329 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 message = "\\ at end of string";
2331 s--;
2332 endinpos = s-starts;
2333 outpos = p-PyUnicode_AS_UNICODE(v);
2334 if (unicode_decode_call_errorhandler(
2335 errors, &errorHandler,
2336 "unicodeescape", message,
2337 starts, size, &startinpos, &endinpos, &exc, &s,
2338 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002339 goto onError;
2340 }
2341 else {
2342 *p++ = '\\';
2343 *p++ = (unsigned char)s[-1];
2344 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002345 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 nextByte:
2348 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002355
Fredrik Lundhccc74732001-02-18 22:13:49 +00002356ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002357 PyErr_SetString(
2358 PyExc_UnicodeError,
2359 "\\N escapes not supported (can't load unicodedata module)"
2360 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002361 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 Py_XDECREF(errorHandler);
2363 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002364 return NULL;
2365
Fredrik Lundhccc74732001-02-18 22:13:49 +00002366onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 Py_XDECREF(errorHandler);
2369 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 return NULL;
2371}
2372
2373/* Return a Unicode-Escape string version of the Unicode object.
2374
2375 If quotes is true, the string is enclosed in u"" or u'' quotes as
2376 appropriate.
2377
2378*/
2379
Thomas Wouters477c8d52006-05-27 19:21:47 +00002380Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2381 Py_ssize_t size,
2382 Py_UNICODE ch)
2383{
2384 /* like wcschr, but doesn't stop at NULL characters */
2385
2386 while (size-- > 0) {
2387 if (*s == ch)
2388 return s;
2389 s++;
2390 }
2391
2392 return NULL;
2393}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002394
Walter Dörwald79e913e2007-05-12 11:08:06 +00002395static const char *hexdigits = "0123456789abcdef";
2396
2397PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2398 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
2400 PyObject *repr;
2401 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
Thomas Wouters89f507f2006-12-13 04:49:30 +00002403 /* XXX(nnorwitz): rather than over-allocating, it would be
2404 better to choose a different scheme. Perhaps scan the
2405 first N-chars of the string and allocate based on that size.
2406 */
2407 /* Initial allocation is based on the longest-possible unichr
2408 escape.
2409
2410 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2411 unichr, so in this case it's the longest unichr escape. In
2412 narrow (UTF-16) builds this is five chars per source unichr
2413 since there are two unichrs in the surrogate pair, so in narrow
2414 (UTF-16) builds it's not the longest unichr escape.
2415
2416 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2417 so in the narrow (UTF-16) build case it's the longest unichr
2418 escape.
2419 */
2420
Walter Dörwald79e913e2007-05-12 11:08:06 +00002421 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002422#ifdef Py_UNICODE_WIDE
2423 + 10*size
2424#else
2425 + 6*size
2426#endif
2427 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428 if (repr == NULL)
2429 return NULL;
2430
Walter Dörwald79e913e2007-05-12 11:08:06 +00002431 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433 while (size-- > 0) {
2434 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002435
Walter Dörwald79e913e2007-05-12 11:08:06 +00002436 /* Escape backslashes */
2437 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438 *p++ = '\\';
2439 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002440 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002441 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002442
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002443#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002444 /* Map 21-bit characters to '\U00xxxxxx' */
2445 else if (ch >= 0x10000) {
2446 *p++ = '\\';
2447 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002448 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2449 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2450 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2451 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2452 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2453 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2454 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2455 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002456 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002457 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002458#else
2459 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002460 else if (ch >= 0xD800 && ch < 0xDC00) {
2461 Py_UNICODE ch2;
2462 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002463
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002464 ch2 = *s++;
2465 size--;
2466 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2467 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2468 *p++ = '\\';
2469 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002470 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2471 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2472 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2473 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2474 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2475 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2476 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2477 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002478 continue;
2479 }
2480 /* Fall through: isolated surrogates are copied as-is */
2481 s--;
2482 size++;
2483 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002484#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002485
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002487 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 *p++ = '\\';
2489 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002490 *p++ = hexdigits[(ch >> 12) & 0x000F];
2491 *p++ = hexdigits[(ch >> 8) & 0x000F];
2492 *p++ = hexdigits[(ch >> 4) & 0x000F];
2493 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002495
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002496 /* Map special whitespace to '\t', \n', '\r' */
2497 else if (ch == '\t') {
2498 *p++ = '\\';
2499 *p++ = 't';
2500 }
2501 else if (ch == '\n') {
2502 *p++ = '\\';
2503 *p++ = 'n';
2504 }
2505 else if (ch == '\r') {
2506 *p++ = '\\';
2507 *p++ = 'r';
2508 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002509
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002510 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002511 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002513 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002514 *p++ = hexdigits[(ch >> 4) & 0x000F];
2515 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002516 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002517
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 /* Copy everything else as-is */
2519 else
2520 *p++ = (char) ch;
2521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522
2523 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002524 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2525 Py_DECREF(repr);
2526 return NULL;
2527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 return repr;
2529}
2530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2532{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002533 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 if (!PyUnicode_Check(unicode)) {
2535 PyErr_BadArgument();
2536 return NULL;
2537 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002538 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2539 PyUnicode_GET_SIZE(unicode));
2540
2541 if (!s)
2542 return NULL;
2543 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2544 PyBytes_GET_SIZE(s));
2545 Py_DECREF(s);
2546 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547}
2548
2549/* --- Raw Unicode Escape Codec ------------------------------------------- */
2550
2551PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002552 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 const char *errors)
2554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002556 Py_ssize_t startinpos;
2557 Py_ssize_t endinpos;
2558 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 const char *end;
2562 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* Escaped strings will always be longer than the resulting
2567 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568 length after conversion to the true value. (But decoding error
2569 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 v = _PyUnicode_New(size);
2571 if (v == NULL)
2572 goto onError;
2573 if (size == 0)
2574 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002575 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 end = s + size;
2577 while (s < end) {
2578 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002579 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002581 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582
2583 /* Non-escape characters are interpreted as Unicode ordinals */
2584 if (*s != '\\') {
2585 *p++ = (unsigned char)*s++;
2586 continue;
2587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 /* \u-escapes are only interpreted iff the number of leading
2591 backslashes if odd */
2592 bs = s;
2593 for (;s < end;) {
2594 if (*s != '\\')
2595 break;
2596 *p++ = (unsigned char)*s++;
2597 }
2598 if (((s - bs) & 1) == 0 ||
2599 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002600 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 continue;
2602 }
2603 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002604 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 s++;
2606
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002607 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002609 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 endinpos = s-starts;
2613 if (unicode_decode_call_errorhandler(
2614 errors, &errorHandler,
2615 "rawunicodeescape", "truncated \\uXXXX",
2616 starts, size, &startinpos, &endinpos, &exc, &s,
2617 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 }
2621 x = (x<<4) & ~0xF;
2622 if (c >= '0' && c <= '9')
2623 x += c - '0';
2624 else if (c >= 'a' && c <= 'f')
2625 x += 10 + c - 'a';
2626 else
2627 x += 10 + c - 'A';
2628 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002629#ifndef Py_UNICODE_WIDE
2630 if (x > 0x10000) {
2631 if (unicode_decode_call_errorhandler(
2632 errors, &errorHandler,
2633 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2634 starts, size, &startinpos, &endinpos, &exc, &s,
2635 (PyObject **)&v, &outpos, &p))
2636 goto onError;
2637 }
2638#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 *p++ = x;
2640 nextByte:
2641 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002643 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002644 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 Py_XDECREF(errorHandler);
2646 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002648
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 onError:
2650 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 Py_XDECREF(errorHandler);
2652 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 return NULL;
2654}
2655
2656PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002657 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658{
2659 PyObject *repr;
2660 char *p;
2661 char *q;
2662
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002663#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002664 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002665#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002666 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002667#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 if (repr == NULL)
2669 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002670 if (size == 0)
2671 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672
Walter Dörwald711005d2007-05-12 12:03:26 +00002673 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 while (size-- > 0) {
2675 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002676#ifdef Py_UNICODE_WIDE
2677 /* Map 32-bit characters to '\Uxxxxxxxx' */
2678 if (ch >= 0x10000) {
2679 *p++ = '\\';
2680 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002681 *p++ = hexdigits[(ch >> 28) & 0xf];
2682 *p++ = hexdigits[(ch >> 24) & 0xf];
2683 *p++ = hexdigits[(ch >> 20) & 0xf];
2684 *p++ = hexdigits[(ch >> 16) & 0xf];
2685 *p++ = hexdigits[(ch >> 12) & 0xf];
2686 *p++ = hexdigits[(ch >> 8) & 0xf];
2687 *p++ = hexdigits[(ch >> 4) & 0xf];
2688 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002689 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002690 else
2691#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 /* Map 16-bit characters to '\uxxxx' */
2693 if (ch >= 256) {
2694 *p++ = '\\';
2695 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002696 *p++ = hexdigits[(ch >> 12) & 0xf];
2697 *p++ = hexdigits[(ch >> 8) & 0xf];
2698 *p++ = hexdigits[(ch >> 4) & 0xf];
2699 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 }
2701 /* Copy everything else as-is */
2702 else
2703 *p++ = (char) ch;
2704 }
2705 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002706 if (PyBytes_Resize(repr, p - q)) {
2707 Py_DECREF(repr);
2708 return NULL;
2709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 return repr;
2711}
2712
2713PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2714{
Walter Dörwald711005d2007-05-12 12:03:26 +00002715 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002717 PyErr_BadArgument();
2718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002720 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2721 PyUnicode_GET_SIZE(unicode));
2722
2723 if (!s)
2724 return NULL;
2725 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2726 PyBytes_GET_SIZE(s));
2727 Py_DECREF(s);
2728 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729}
2730
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002731/* --- Unicode Internal Codec ------------------------------------------- */
2732
2733PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002734 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002735 const char *errors)
2736{
2737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002738 Py_ssize_t startinpos;
2739 Py_ssize_t endinpos;
2740 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002741 PyUnicodeObject *v;
2742 Py_UNICODE *p;
2743 const char *end;
2744 const char *reason;
2745 PyObject *errorHandler = NULL;
2746 PyObject *exc = NULL;
2747
Neal Norwitzd43069c2006-01-08 01:12:10 +00002748#ifdef Py_UNICODE_WIDE
2749 Py_UNICODE unimax = PyUnicode_GetMax();
2750#endif
2751
Thomas Wouters89f507f2006-12-13 04:49:30 +00002752 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002753 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2754 if (v == NULL)
2755 goto onError;
2756 if (PyUnicode_GetSize((PyObject *)v) == 0)
2757 return (PyObject *)v;
2758 p = PyUnicode_AS_UNICODE(v);
2759 end = s + size;
2760
2761 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002762 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002763 /* We have to sanity check the raw data, otherwise doom looms for
2764 some malformed UCS-4 data. */
2765 if (
2766 #ifdef Py_UNICODE_WIDE
2767 *p > unimax || *p < 0 ||
2768 #endif
2769 end-s < Py_UNICODE_SIZE
2770 )
2771 {
2772 startinpos = s - starts;
2773 if (end-s < Py_UNICODE_SIZE) {
2774 endinpos = end-starts;
2775 reason = "truncated input";
2776 }
2777 else {
2778 endinpos = s - starts + Py_UNICODE_SIZE;
2779 reason = "illegal code point (> 0x10FFFF)";
2780 }
2781 outpos = p - PyUnicode_AS_UNICODE(v);
2782 if (unicode_decode_call_errorhandler(
2783 errors, &errorHandler,
2784 "unicode_internal", reason,
2785 starts, size, &startinpos, &endinpos, &exc, &s,
2786 (PyObject **)&v, &outpos, &p)) {
2787 goto onError;
2788 }
2789 }
2790 else {
2791 p++;
2792 s += Py_UNICODE_SIZE;
2793 }
2794 }
2795
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002796 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002797 goto onError;
2798 Py_XDECREF(errorHandler);
2799 Py_XDECREF(exc);
2800 return (PyObject *)v;
2801
2802 onError:
2803 Py_XDECREF(v);
2804 Py_XDECREF(errorHandler);
2805 Py_XDECREF(exc);
2806 return NULL;
2807}
2808
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809/* --- Latin-1 Codec ------------------------------------------------------ */
2810
2811PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002812 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 const char *errors)
2814{
2815 PyUnicodeObject *v;
2816 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002819 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002820 Py_UNICODE r = *(unsigned char*)s;
2821 return PyUnicode_FromUnicode(&r, 1);
2822 }
2823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 v = _PyUnicode_New(size);
2825 if (v == NULL)
2826 goto onError;
2827 if (size == 0)
2828 return (PyObject *)v;
2829 p = PyUnicode_AS_UNICODE(v);
2830 while (size-- > 0)
2831 *p++ = (unsigned char)*s++;
2832 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002833
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 onError:
2835 Py_XDECREF(v);
2836 return NULL;
2837}
2838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839/* create or adjust a UnicodeEncodeError */
2840static void make_encode_exception(PyObject **exceptionObject,
2841 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002842 const Py_UNICODE *unicode, Py_ssize_t size,
2843 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 if (*exceptionObject == NULL) {
2847 *exceptionObject = PyUnicodeEncodeError_Create(
2848 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
2850 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2852 goto onError;
2853 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2854 goto onError;
2855 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2856 goto onError;
2857 return;
2858 onError:
2859 Py_DECREF(*exceptionObject);
2860 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 }
2862}
2863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864/* raises a UnicodeEncodeError */
2865static void raise_encode_exception(PyObject **exceptionObject,
2866 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 const Py_UNICODE *unicode, Py_ssize_t size,
2868 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002869 const char *reason)
2870{
2871 make_encode_exception(exceptionObject,
2872 encoding, unicode, size, startpos, endpos, reason);
2873 if (*exceptionObject != NULL)
2874 PyCodec_StrictErrors(*exceptionObject);
2875}
2876
2877/* error handling callback helper:
2878 build arguments, call the callback and check the arguments,
2879 put the result into newpos and return the replacement string, which
2880 has to be freed by the caller */
2881static PyObject *unicode_encode_call_errorhandler(const char *errors,
2882 PyObject **errorHandler,
2883 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002884 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2885 Py_ssize_t startpos, Py_ssize_t endpos,
2886 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002888 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002889
2890 PyObject *restuple;
2891 PyObject *resunicode;
2892
2893 if (*errorHandler == NULL) {
2894 *errorHandler = PyCodec_LookupError(errors);
2895 if (*errorHandler == NULL)
2896 return NULL;
2897 }
2898
2899 make_encode_exception(exceptionObject,
2900 encoding, unicode, size, startpos, endpos, reason);
2901 if (*exceptionObject == NULL)
2902 return NULL;
2903
2904 restuple = PyObject_CallFunctionObjArgs(
2905 *errorHandler, *exceptionObject, NULL);
2906 if (restuple == NULL)
2907 return NULL;
2908 if (!PyTuple_Check(restuple)) {
2909 PyErr_Format(PyExc_TypeError, &argparse[4]);
2910 Py_DECREF(restuple);
2911 return NULL;
2912 }
2913 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2914 &resunicode, newpos)) {
2915 Py_DECREF(restuple);
2916 return NULL;
2917 }
2918 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002919 *newpos = size+*newpos;
2920 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002921 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002922 Py_DECREF(restuple);
2923 return NULL;
2924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 Py_INCREF(resunicode);
2926 Py_DECREF(restuple);
2927 return resunicode;
2928}
2929
2930static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002931 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 const char *errors,
2933 int limit)
2934{
2935 /* output object */
2936 PyObject *res;
2937 /* pointers to the beginning and end+1 of input */
2938 const Py_UNICODE *startp = p;
2939 const Py_UNICODE *endp = p + size;
2940 /* pointer to the beginning of the unencodable characters */
2941 /* const Py_UNICODE *badp = NULL; */
2942 /* pointer into the output */
2943 char *str;
2944 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002945 Py_ssize_t respos = 0;
2946 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002947 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2948 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
2951 /* the following variable is used for caching string comparisons
2952 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2953 int known_errorHandler = -1;
2954
2955 /* allocate enough for a simple encoding without
2956 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002957 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 if (res == NULL)
2959 goto onError;
2960 if (size == 0)
2961 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002962 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 ressize = size;
2964
2965 while (p<endp) {
2966 Py_UNICODE c = *p;
2967
2968 /* can we encode this? */
2969 if (c<limit) {
2970 /* no overflow check, because we know that the space is enough */
2971 *str++ = (char)c;
2972 ++p;
2973 }
2974 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002975 Py_ssize_t unicodepos = p-startp;
2976 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002978 Py_ssize_t repsize;
2979 Py_ssize_t newpos;
2980 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981 Py_UNICODE *uni2;
2982 /* startpos for collecting unencodable chars */
2983 const Py_UNICODE *collstart = p;
2984 const Py_UNICODE *collend = p;
2985 /* find all unecodable characters */
2986 while ((collend < endp) && ((*collend)>=limit))
2987 ++collend;
2988 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2989 if (known_errorHandler==-1) {
2990 if ((errors==NULL) || (!strcmp(errors, "strict")))
2991 known_errorHandler = 1;
2992 else if (!strcmp(errors, "replace"))
2993 known_errorHandler = 2;
2994 else if (!strcmp(errors, "ignore"))
2995 known_errorHandler = 3;
2996 else if (!strcmp(errors, "xmlcharrefreplace"))
2997 known_errorHandler = 4;
2998 else
2999 known_errorHandler = 0;
3000 }
3001 switch (known_errorHandler) {
3002 case 1: /* strict */
3003 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3004 goto onError;
3005 case 2: /* replace */
3006 while (collstart++<collend)
3007 *str++ = '?'; /* fall through */
3008 case 3: /* ignore */
3009 p = collend;
3010 break;
3011 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003012 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 /* determine replacement size (temporarily (mis)uses p) */
3014 for (p = collstart, repsize = 0; p < collend; ++p) {
3015 if (*p<10)
3016 repsize += 2+1+1;
3017 else if (*p<100)
3018 repsize += 2+2+1;
3019 else if (*p<1000)
3020 repsize += 2+3+1;
3021 else if (*p<10000)
3022 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003023#ifndef Py_UNICODE_WIDE
3024 else
3025 repsize += 2+5+1;
3026#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 else if (*p<100000)
3028 repsize += 2+5+1;
3029 else if (*p<1000000)
3030 repsize += 2+6+1;
3031 else
3032 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003033#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 }
3035 requiredsize = respos+repsize+(endp-collend);
3036 if (requiredsize > ressize) {
3037 if (requiredsize<2*ressize)
3038 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003039 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003041 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 ressize = requiredsize;
3043 }
3044 /* generate replacement (temporarily (mis)uses p) */
3045 for (p = collstart; p < collend; ++p) {
3046 str += sprintf(str, "&#%d;", (int)*p);
3047 }
3048 p = collend;
3049 break;
3050 default:
3051 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3052 encoding, reason, startp, size, &exc,
3053 collstart-startp, collend-startp, &newpos);
3054 if (repunicode == NULL)
3055 goto onError;
3056 /* need more space? (at least enough for what we
3057 have+the replacement+the rest of the string, so
3058 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003059 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 repsize = PyUnicode_GET_SIZE(repunicode);
3061 requiredsize = respos+repsize+(endp-collend);
3062 if (requiredsize > ressize) {
3063 if (requiredsize<2*ressize)
3064 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003065 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 Py_DECREF(repunicode);
3067 goto onError;
3068 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003069 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 ressize = requiredsize;
3071 }
3072 /* check if there is anything unencodable in the replacement
3073 and copy it to the output */
3074 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3075 c = *uni2;
3076 if (c >= limit) {
3077 raise_encode_exception(&exc, encoding, startp, size,
3078 unicodepos, unicodepos+1, reason);
3079 Py_DECREF(repunicode);
3080 goto onError;
3081 }
3082 *str = (char)c;
3083 }
3084 p = startp + newpos;
3085 Py_DECREF(repunicode);
3086 }
3087 }
3088 }
    /* Resize if we allocated too much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003090 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 if (respos<ressize)
3092 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003093 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 Py_XDECREF(errorHandler);
3095 Py_XDECREF(exc);
3096 return res;
3097
3098 onError:
3099 Py_XDECREF(res);
3100 Py_XDECREF(errorHandler);
3101 Py_XDECREF(exc);
3102 return NULL;
3103}
3104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 const char *errors)
3108{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110}
3111
3112PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3113{
3114 if (!PyUnicode_Check(unicode)) {
3115 PyErr_BadArgument();
3116 return NULL;
3117 }
3118 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3119 PyUnicode_GET_SIZE(unicode),
3120 NULL);
3121}
3122
3123/* --- 7-bit ASCII Codec -------------------------------------------------- */
3124
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 const char *errors)
3128{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 PyUnicodeObject *v;
3131 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003132 Py_ssize_t startinpos;
3133 Py_ssize_t endinpos;
3134 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 const char *e;
3136 PyObject *errorHandler = NULL;
3137 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003140 if (size == 1 && *(unsigned char*)s < 128) {
3141 Py_UNICODE r = *(unsigned char*)s;
3142 return PyUnicode_FromUnicode(&r, 1);
3143 }
Tim Petersced69f82003-09-16 20:30:58 +00003144
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 v = _PyUnicode_New(size);
3146 if (v == NULL)
3147 goto onError;
3148 if (size == 0)
3149 return (PyObject *)v;
3150 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 e = s + size;
3152 while (s < e) {
3153 register unsigned char c = (unsigned char)*s;
3154 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 ++s;
3157 }
3158 else {
3159 startinpos = s-starts;
3160 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003161 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 if (unicode_decode_call_errorhandler(
3163 errors, &errorHandler,
3164 "ascii", "ordinal not in range(128)",
3165 starts, size, &startinpos, &endinpos, &exc, &s,
3166 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003170 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003171 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003172 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 Py_XDECREF(errorHandler);
3174 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003176
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 onError:
3178 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 Py_XDECREF(errorHandler);
3180 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181 return NULL;
3182}
3183
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003185 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 const char *errors)
3187{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189}
3190
3191PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3192{
3193 if (!PyUnicode_Check(unicode)) {
3194 PyErr_BadArgument();
3195 return NULL;
3196 }
3197 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3198 PyUnicode_GET_SIZE(unicode),
3199 NULL);
3200}
3201
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003202#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003203
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003204/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003205
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003206#if SIZEOF_INT < SIZEOF_SSIZE_T
3207#define NEED_RETRY
3208#endif
3209
3210/* XXX This code is limited to "true" double-byte encodings, as
3211 a) it assumes an incomplete character consists of a single byte, and
3212 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3213 encodings, see IsDBCSLeadByteEx documentation. */
3214
/* Decide whether the byte at s[offset] should be treated as a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the character
   preceding it is located with CharPrev() and 1 is returned if any of these
   hold:
     - CharPrev() returned the same position (there is no previous character),
     - the previous character does not begin with a lead byte,
     - the previous character is exactly two bytes long.
   In every other case the result is 0. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3225
3226/*
3227 * Decode MBCS string into unicode object. If 'final' is set, converts
3228 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3229 */
3230static int decode_mbcs(PyUnicodeObject **v,
3231 const char *s, /* MBCS string */
3232 int size, /* sizeof MBCS string */
3233 int final)
3234{
3235 Py_UNICODE *p;
3236 Py_ssize_t n = 0;
3237 int usize = 0;
3238
3239 assert(size >= 0);
3240
3241 /* Skip trailing lead-byte unless 'final' is set */
3242 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3243 --size;
3244
3245 /* First get the size of the result */
3246 if (size > 0) {
3247 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3248 if (usize == 0) {
3249 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3250 return -1;
3251 }
3252 }
3253
3254 if (*v == NULL) {
3255 /* Create unicode object */
3256 *v = _PyUnicode_New(usize);
3257 if (*v == NULL)
3258 return -1;
3259 }
3260 else {
3261 /* Extend unicode object */
3262 n = PyUnicode_GET_SIZE(*v);
3263 if (_PyUnicode_Resize(v, n + usize) < 0)
3264 return -1;
3265 }
3266
3267 /* Do the conversion */
3268 if (size > 0) {
3269 p = PyUnicode_AS_UNICODE(*v) + n;
3270 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3271 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3272 return -1;
3273 }
3274 }
3275
3276 return size;
3277}
3278
3279PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3280 Py_ssize_t size,
3281 const char *errors,
3282 Py_ssize_t *consumed)
3283{
3284 PyUnicodeObject *v = NULL;
3285 int done;
3286
3287 if (consumed)
3288 *consumed = 0;
3289
3290#ifdef NEED_RETRY
3291 retry:
3292 if (size > INT_MAX)
3293 done = decode_mbcs(&v, s, INT_MAX, 0);
3294 else
3295#endif
3296 done = decode_mbcs(&v, s, (int)size, !consumed);
3297
3298 if (done < 0) {
3299 Py_XDECREF(v);
3300 return NULL;
3301 }
3302
3303 if (consumed)
3304 *consumed += done;
3305
3306#ifdef NEED_RETRY
3307 if (size > INT_MAX) {
3308 s += done;
3309 size -= done;
3310 goto retry;
3311 }
3312#endif
3313
3314 return (PyObject *)v;
3315}
3316
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003317PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003318 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003319 const char *errors)
3320{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003321 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3322}
3323
3324/*
3325 * Convert unicode into string object (MBCS).
3326 * Returns 0 if succeed, -1 otherwise.
3327 */
3328static int encode_mbcs(PyObject **repr,
3329 const Py_UNICODE *p, /* unicode */
3330 int size) /* size of unicode */
3331{
3332 int mbcssize = 0;
3333 Py_ssize_t n = 0;
3334
3335 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003336
3337 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003338 if (size > 0) {
3339 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3340 if (mbcssize == 0) {
3341 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3342 return -1;
3343 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003344 }
3345
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003346 if (*repr == NULL) {
3347 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003348 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003349 if (*repr == NULL)
3350 return -1;
3351 }
3352 else {
3353 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003354 n = PyBytes_Size(*repr);
3355 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003356 return -1;
3357 }
3358
3359 /* Do the conversion */
3360 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003361 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003362 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3363 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3364 return -1;
3365 }
3366 }
3367
3368 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003369}
3370
3371PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003373 const char *errors)
3374{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003375 PyObject *repr = NULL;
3376 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003377
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003378#ifdef NEED_RETRY
3379 retry:
3380 if (size > INT_MAX)
3381 ret = encode_mbcs(&repr, p, INT_MAX);
3382 else
3383#endif
3384 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003385
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003386 if (ret < 0) {
3387 Py_XDECREF(repr);
3388 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003389 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003390
3391#ifdef NEED_RETRY
3392 if (size > INT_MAX) {
3393 p += INT_MAX;
3394 size -= INT_MAX;
3395 goto retry;
3396 }
3397#endif
3398
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003399 return repr;
3400}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003401
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003402PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3403{
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 return NULL;
3407 }
3408 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3409 PyUnicode_GET_SIZE(unicode),
3410 NULL);
3411}
3412
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003413#undef NEED_RETRY
3414
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003415#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003416
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417/* --- Character Mapping Codec -------------------------------------------- */
3418
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003420 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 PyObject *mapping,
3422 const char *errors)
3423{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003425 Py_ssize_t startinpos;
3426 Py_ssize_t endinpos;
3427 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 PyUnicodeObject *v;
3430 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003431 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 PyObject *errorHandler = NULL;
3433 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003434 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003435 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003436
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 /* Default to Latin-1 */
3438 if (mapping == NULL)
3439 return PyUnicode_DecodeLatin1(s, size, errors);
3440
3441 v = _PyUnicode_New(size);
3442 if (v == NULL)
3443 goto onError;
3444 if (size == 0)
3445 return (PyObject *)v;
3446 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003448 if (PyUnicode_CheckExact(mapping)) {
3449 mapstring = PyUnicode_AS_UNICODE(mapping);
3450 maplen = PyUnicode_GET_SIZE(mapping);
3451 while (s < e) {
3452 unsigned char ch = *s;
3453 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003455 if (ch < maplen)
3456 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003458 if (x == 0xfffe) {
3459 /* undefined mapping */
3460 outpos = p-PyUnicode_AS_UNICODE(v);
3461 startinpos = s-starts;
3462 endinpos = startinpos+1;
3463 if (unicode_decode_call_errorhandler(
3464 errors, &errorHandler,
3465 "charmap", "character maps to <undefined>",
3466 starts, size, &startinpos, &endinpos, &exc, &s,
3467 (PyObject **)&v, &outpos, &p)) {
3468 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003469 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003470 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003471 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003472 *p++ = x;
3473 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003475 }
3476 else {
3477 while (s < e) {
3478 unsigned char ch = *s;
3479 PyObject *w, *x;
3480
3481 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3482 w = PyInt_FromLong((long)ch);
3483 if (w == NULL)
3484 goto onError;
3485 x = PyObject_GetItem(mapping, w);
3486 Py_DECREF(w);
3487 if (x == NULL) {
3488 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3489 /* No mapping found means: mapping is undefined. */
3490 PyErr_Clear();
3491 x = Py_None;
3492 Py_INCREF(x);
3493 } else
3494 goto onError;
3495 }
3496
3497 /* Apply mapping */
3498 if (PyInt_Check(x)) {
3499 long value = PyInt_AS_LONG(x);
3500 if (value < 0 || value > 65535) {
3501 PyErr_SetString(PyExc_TypeError,
3502 "character mapping must be in range(65536)");
3503 Py_DECREF(x);
3504 goto onError;
3505 }
3506 *p++ = (Py_UNICODE)value;
3507 }
3508 else if (x == Py_None) {
3509 /* undefined mapping */
3510 outpos = p-PyUnicode_AS_UNICODE(v);
3511 startinpos = s-starts;
3512 endinpos = startinpos+1;
3513 if (unicode_decode_call_errorhandler(
3514 errors, &errorHandler,
3515 "charmap", "character maps to <undefined>",
3516 starts, size, &startinpos, &endinpos, &exc, &s,
3517 (PyObject **)&v, &outpos, &p)) {
3518 Py_DECREF(x);
3519 goto onError;
3520 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003521 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003522 continue;
3523 }
3524 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003525 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003526
3527 if (targetsize == 1)
3528 /* 1-1 mapping */
3529 *p++ = *PyUnicode_AS_UNICODE(x);
3530
3531 else if (targetsize > 1) {
3532 /* 1-n mapping */
3533 if (targetsize > extrachars) {
3534 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003535 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3536 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003537 (targetsize << 2);
3538 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003539 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003540 if (_PyUnicode_Resize(&v,
3541 PyUnicode_GET_SIZE(v) + needed) < 0) {
3542 Py_DECREF(x);
3543 goto onError;
3544 }
3545 p = PyUnicode_AS_UNICODE(v) + oldpos;
3546 }
3547 Py_UNICODE_COPY(p,
3548 PyUnicode_AS_UNICODE(x),
3549 targetsize);
3550 p += targetsize;
3551 extrachars -= targetsize;
3552 }
3553 /* 1-0 mapping: skip the character */
3554 }
3555 else {
3556 /* wrong return value */
3557 PyErr_SetString(PyExc_TypeError,
3558 "character mapping must return integer, None or unicode");
3559 Py_DECREF(x);
3560 goto onError;
3561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003563 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 }
3566 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003567 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 Py_XDECREF(errorHandler);
3570 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003572
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 Py_XDECREF(v);
3577 return NULL;
3578}
3579
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003580/* Charmap encoding: the lookup table */
3581
3582struct encoding_map{
3583 PyObject_HEAD
3584 unsigned char level1[32];
3585 int count2, count3;
3586 unsigned char level23[1];
3587};
3588
3589static PyObject*
3590encoding_map_size(PyObject *obj, PyObject* args)
3591{
3592 struct encoding_map *map = (struct encoding_map*)obj;
3593 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3594 128*map->count3);
3595}
3596
3597static PyMethodDef encoding_map_methods[] = {
3598 {"size", encoding_map_size, METH_NOARGS,
3599 PyDoc_STR("Return the size (in bytes) of this object") },
3600 { 0 }
3601};
3602
3603static void
3604encoding_map_dealloc(PyObject* o)
3605{
3606 PyObject_FREE(o);
3607}
3608
3609static PyTypeObject EncodingMapType = {
3610 PyObject_HEAD_INIT(NULL)
3611 0, /*ob_size*/
3612 "EncodingMap", /*tp_name*/
3613 sizeof(struct encoding_map), /*tp_basicsize*/
3614 0, /*tp_itemsize*/
3615 /* methods */
3616 encoding_map_dealloc, /*tp_dealloc*/
3617 0, /*tp_print*/
3618 0, /*tp_getattr*/
3619 0, /*tp_setattr*/
3620 0, /*tp_compare*/
3621 0, /*tp_repr*/
3622 0, /*tp_as_number*/
3623 0, /*tp_as_sequence*/
3624 0, /*tp_as_mapping*/
3625 0, /*tp_hash*/
3626 0, /*tp_call*/
3627 0, /*tp_str*/
3628 0, /*tp_getattro*/
3629 0, /*tp_setattro*/
3630 0, /*tp_as_buffer*/
3631 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3632 0, /*tp_doc*/
3633 0, /*tp_traverse*/
3634 0, /*tp_clear*/
3635 0, /*tp_richcompare*/
3636 0, /*tp_weaklistoffset*/
3637 0, /*tp_iter*/
3638 0, /*tp_iternext*/
3639 encoding_map_methods, /*tp_methods*/
3640 0, /*tp_members*/
3641 0, /*tp_getset*/
3642 0, /*tp_base*/
3643 0, /*tp_dict*/
3644 0, /*tp_descr_get*/
3645 0, /*tp_descr_set*/
3646 0, /*tp_dictoffset*/
3647 0, /*tp_init*/
3648 0, /*tp_alloc*/
3649 0, /*tp_new*/
3650 0, /*tp_free*/
3651 0, /*tp_is_gc*/
3652};
3653
3654PyObject*
3655PyUnicode_BuildEncodingMap(PyObject* string)
3656{
3657 Py_UNICODE *decode;
3658 PyObject *result;
3659 struct encoding_map *mresult;
3660 int i;
3661 int need_dict = 0;
3662 unsigned char level1[32];
3663 unsigned char level2[512];
3664 unsigned char *mlevel1, *mlevel2, *mlevel3;
3665 int count2 = 0, count3 = 0;
3666
3667 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3668 PyErr_BadArgument();
3669 return NULL;
3670 }
3671 decode = PyUnicode_AS_UNICODE(string);
3672 memset(level1, 0xFF, sizeof level1);
3673 memset(level2, 0xFF, sizeof level2);
3674
3675 /* If there isn't a one-to-one mapping of NULL to \0,
3676 or if there are non-BMP characters, we need to use
3677 a mapping dictionary. */
3678 if (decode[0] != 0)
3679 need_dict = 1;
3680 for (i = 1; i < 256; i++) {
3681 int l1, l2;
3682 if (decode[i] == 0
3683 #ifdef Py_UNICODE_WIDE
3684 || decode[i] > 0xFFFF
3685 #endif
3686 ) {
3687 need_dict = 1;
3688 break;
3689 }
3690 if (decode[i] == 0xFFFE)
3691 /* unmapped character */
3692 continue;
3693 l1 = decode[i] >> 11;
3694 l2 = decode[i] >> 7;
3695 if (level1[l1] == 0xFF)
3696 level1[l1] = count2++;
3697 if (level2[l2] == 0xFF)
3698 level2[l2] = count3++;
3699 }
3700
3701 if (count2 >= 0xFF || count3 >= 0xFF)
3702 need_dict = 1;
3703
3704 if (need_dict) {
3705 PyObject *result = PyDict_New();
3706 PyObject *key, *value;
3707 if (!result)
3708 return NULL;
3709 for (i = 0; i < 256; i++) {
3710 key = value = NULL;
3711 key = PyInt_FromLong(decode[i]);
3712 value = PyInt_FromLong(i);
3713 if (!key || !value)
3714 goto failed1;
3715 if (PyDict_SetItem(result, key, value) == -1)
3716 goto failed1;
3717 Py_DECREF(key);
3718 Py_DECREF(value);
3719 }
3720 return result;
3721 failed1:
3722 Py_XDECREF(key);
3723 Py_XDECREF(value);
3724 Py_DECREF(result);
3725 return NULL;
3726 }
3727
3728 /* Create a three-level trie */
3729 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3730 16*count2 + 128*count3 - 1);
3731 if (!result)
3732 return PyErr_NoMemory();
3733 PyObject_Init(result, &EncodingMapType);
3734 mresult = (struct encoding_map*)result;
3735 mresult->count2 = count2;
3736 mresult->count3 = count3;
3737 mlevel1 = mresult->level1;
3738 mlevel2 = mresult->level23;
3739 mlevel3 = mresult->level23 + 16*count2;
3740 memcpy(mlevel1, level1, 32);
3741 memset(mlevel2, 0xFF, 16*count2);
3742 memset(mlevel3, 0, 128*count3);
3743 count3 = 0;
3744 for (i = 1; i < 256; i++) {
3745 int o1, o2, o3, i2, i3;
3746 if (decode[i] == 0xFFFE)
3747 /* unmapped character */
3748 continue;
3749 o1 = decode[i]>>11;
3750 o2 = (decode[i]>>7) & 0xF;
3751 i2 = 16*mlevel1[o1] + o2;
3752 if (mlevel2[i2] == 0xFF)
3753 mlevel2[i2] = count3++;
3754 o3 = decode[i] & 0x7F;
3755 i3 = 128*mlevel2[i2] + o3;
3756 mlevel3[i3] = i;
3757 }
3758 return result;
3759}
3760
3761static int
3762encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3763{
3764 struct encoding_map *map = (struct encoding_map*)mapping;
3765 int l1 = c>>11;
3766 int l2 = (c>>7) & 0xF;
3767 int l3 = c & 0x7F;
3768 int i;
3769
3770#ifdef Py_UNICODE_WIDE
3771 if (c > 0xFFFF) {
3772 return -1;
3773 }
3774#endif
3775 if (c == 0)
3776 return 0;
3777 /* level 1*/
3778 i = map->level1[l1];
3779 if (i == 0xFF) {
3780 return -1;
3781 }
3782 /* level 2*/
3783 i = map->level23[16*i+l2];
3784 if (i == 0xFF) {
3785 return -1;
3786 }
3787 /* level 3 */
3788 i = map->level23[16*map->count2 + 128*i + l3];
3789 if (i == 0) {
3790 return -1;
3791 }
3792 return i;
3793}
3794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795/* Lookup the character ch in the mapping. If the character
3796 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003797 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 PyObject *w = PyInt_FromLong((long)c);
3801 PyObject *x;
3802
3803 if (w == NULL)
3804 return NULL;
3805 x = PyObject_GetItem(mapping, w);
3806 Py_DECREF(w);
3807 if (x == NULL) {
3808 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3809 /* No mapping found means: mapping is undefined. */
3810 PyErr_Clear();
3811 x = Py_None;
3812 Py_INCREF(x);
3813 return x;
3814 } else
3815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003817 else if (x == Py_None)
3818 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 else if (PyInt_Check(x)) {
3820 long value = PyInt_AS_LONG(x);
3821 if (value < 0 || value > 255) {
3822 PyErr_SetString(PyExc_TypeError,
3823 "character mapping must be in range(256)");
3824 Py_DECREF(x);
3825 return NULL;
3826 }
3827 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 else if (PyString_Check(x))
3830 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003833 PyErr_Format(PyExc_TypeError,
3834 "character mapping must return integer, None or str8, not %.400s",
3835 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 Py_DECREF(x);
3837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 }
3839}
3840
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003841static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003842charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003843{
Walter Dörwald827b0552007-05-12 13:23:53 +00003844 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003845 /* exponentially overallocate to minimize reallocations */
3846 if (requiredsize < 2*outsize)
3847 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003848 if (PyBytes_Resize(outobj, requiredsize)) {
3849 Py_DECREF(outobj);
3850 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003851 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003852 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003853}
3854
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003859 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 space is available. Return a new reference to the object that
3861 was put in the output buffer, or Py_None, if the mapping was undefined
3862 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003863 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003865charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003866 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003868 PyObject *rep;
3869 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003870 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003872 if (mapping->ob_type == &EncodingMapType) {
3873 int res = encoding_map_lookup(c, mapping);
3874 Py_ssize_t requiredsize = *outpos+1;
3875 if (res == -1)
3876 return enc_FAILED;
3877 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003878 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003880 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003881 outstart[(*outpos)++] = (char)res;
3882 return enc_SUCCESS;
3883 }
3884
3885 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003887 return enc_EXCEPTION;
3888 else if (rep==Py_None) {
3889 Py_DECREF(rep);
3890 return enc_FAILED;
3891 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003894 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003895 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003897 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003899 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3901 }
3902 else {
3903 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3905 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003906 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003907 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003909 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003911 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 memcpy(outstart + *outpos, repchars, repsize);
3913 *outpos += repsize;
3914 }
3915 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003916 Py_DECREF(rep);
3917 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918}
3919
3920/* handle an error in PyUnicode_EncodeCharmap
3921 Return 0 on success, -1 on error */
3922static
3923int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003924 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003926 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003927 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928{
3929 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t repsize;
3931 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 Py_UNICODE *uni2;
3933 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t collstartpos = *inpos;
3935 Py_ssize_t collendpos = *inpos+1;
3936 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 char *encoding = "charmap";
3938 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003939 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 /* find all unencodable characters */
3942 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003943 PyObject *rep;
3944 if (mapping->ob_type == &EncodingMapType) {
3945 int res = encoding_map_lookup(p[collendpos], mapping);
3946 if (res != -1)
3947 break;
3948 ++collendpos;
3949 continue;
3950 }
3951
3952 rep = charmapencode_lookup(p[collendpos], mapping);
3953 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003955 else if (rep!=Py_None) {
3956 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 break;
3958 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003959 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 ++collendpos;
3961 }
3962 /* cache callback name lookup
3963 * (if not done yet, i.e. it's the first error) */
3964 if (*known_errorHandler==-1) {
3965 if ((errors==NULL) || (!strcmp(errors, "strict")))
3966 *known_errorHandler = 1;
3967 else if (!strcmp(errors, "replace"))
3968 *known_errorHandler = 2;
3969 else if (!strcmp(errors, "ignore"))
3970 *known_errorHandler = 3;
3971 else if (!strcmp(errors, "xmlcharrefreplace"))
3972 *known_errorHandler = 4;
3973 else
3974 *known_errorHandler = 0;
3975 }
3976 switch (*known_errorHandler) {
3977 case 1: /* strict */
3978 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3979 return -1;
3980 case 2: /* replace */
3981 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3982 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003983 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 return -1;
3985 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003986 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3988 return -1;
3989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 }
3991 /* fall through */
3992 case 3: /* ignore */
3993 *inpos = collendpos;
3994 break;
3995 case 4: /* xmlcharrefreplace */
3996 /* generate replacement (temporarily (mis)uses p) */
3997 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3998 char buffer[2+29+1+1];
3999 char *cp;
4000 sprintf(buffer, "&#%d;", (int)p[collpos]);
4001 for (cp = buffer; *cp; ++cp) {
4002 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004003 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004005 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4007 return -1;
4008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 }
4010 }
4011 *inpos = collendpos;
4012 break;
4013 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004014 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 encoding, reason, p, size, exceptionObject,
4016 collstartpos, collendpos, &newpos);
4017 if (repunicode == NULL)
4018 return -1;
4019 /* generate replacement */
4020 repsize = PyUnicode_GET_SIZE(repunicode);
4021 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4022 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 return -1;
4025 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4029 return -1;
4030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004031 }
4032 *inpos = newpos;
4033 Py_DECREF(repunicode);
4034 }
4035 return 0;
4036}
4037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 PyObject *mapping,
4041 const char *errors)
4042{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 /* output object */
4044 PyObject *res = NULL;
4045 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004048 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 PyObject *errorHandler = NULL;
4050 PyObject *exc = NULL;
4051 /* the following variable is used for caching string comparisons
4052 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4053 * 3=ignore, 4=xmlcharrefreplace */
4054 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055
4056 /* Default to Latin-1 */
4057 if (mapping == NULL)
4058 return PyUnicode_EncodeLatin1(p, size, errors);
4059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 /* allocate enough for a simple encoding without
4061 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004062 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 if (res == NULL)
4064 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004065 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 while (inpos<size) {
4069 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004070 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004071 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004073 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 if (charmap_encoding_error(p, size, &inpos, mapping,
4075 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004076 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004077 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004078 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 else
4082 /* done with this character => adjust input position */
4083 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004087 if (respos<PyBytes_GET_SIZE(res)) {
4088 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 goto onError;
4090 }
4091 Py_XDECREF(exc);
4092 Py_XDECREF(errorHandler);
4093 return res;
4094
4095 onError:
4096 Py_XDECREF(res);
4097 Py_XDECREF(exc);
4098 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 return NULL;
4100}
4101
4102PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4103 PyObject *mapping)
4104{
4105 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4106 PyErr_BadArgument();
4107 return NULL;
4108 }
4109 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4110 PyUnicode_GET_SIZE(unicode),
4111 mapping,
4112 NULL);
4113}
4114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115/* create or adjust a UnicodeTranslateError */
4116static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 const Py_UNICODE *unicode, Py_ssize_t size,
4118 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 if (*exceptionObject == NULL) {
4122 *exceptionObject = PyUnicodeTranslateError_Create(
4123 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 }
4125 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4127 goto onError;
4128 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4129 goto onError;
4130 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4131 goto onError;
4132 return;
4133 onError:
4134 Py_DECREF(*exceptionObject);
4135 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 }
4137}
4138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139/* raises a UnicodeTranslateError */
4140static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004141 const Py_UNICODE *unicode, Py_ssize_t size,
4142 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 const char *reason)
4144{
4145 make_translate_exception(exceptionObject,
4146 unicode, size, startpos, endpos, reason);
4147 if (*exceptionObject != NULL)
4148 PyCodec_StrictErrors(*exceptionObject);
4149}
4150
4151/* error handling callback helper:
4152 build arguments, call the callback and check the arguments,
4153 put the result into newpos and return the replacement string, which
4154 has to be freed by the caller */
4155static PyObject *unicode_translate_call_errorhandler(const char *errors,
4156 PyObject **errorHandler,
4157 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004158 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4159 Py_ssize_t startpos, Py_ssize_t endpos,
4160 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004162 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004164 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 PyObject *restuple;
4166 PyObject *resunicode;
4167
4168 if (*errorHandler == NULL) {
4169 *errorHandler = PyCodec_LookupError(errors);
4170 if (*errorHandler == NULL)
4171 return NULL;
4172 }
4173
4174 make_translate_exception(exceptionObject,
4175 unicode, size, startpos, endpos, reason);
4176 if (*exceptionObject == NULL)
4177 return NULL;
4178
4179 restuple = PyObject_CallFunctionObjArgs(
4180 *errorHandler, *exceptionObject, NULL);
4181 if (restuple == NULL)
4182 return NULL;
4183 if (!PyTuple_Check(restuple)) {
4184 PyErr_Format(PyExc_TypeError, &argparse[4]);
4185 Py_DECREF(restuple);
4186 return NULL;
4187 }
4188 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004189 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 Py_DECREF(restuple);
4191 return NULL;
4192 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004193 if (i_newpos<0)
4194 *newpos = size+i_newpos;
4195 else
4196 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004197 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004198 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004199 Py_DECREF(restuple);
4200 return NULL;
4201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_INCREF(resunicode);
4203 Py_DECREF(restuple);
4204 return resunicode;
4205}
4206
4207/* Lookup the character ch in the mapping and put the result in result,
4208 which must be decrefed by the caller.
4209 Return 0 on success, -1 on error */
4210static
4211int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4212{
4213 PyObject *w = PyInt_FromLong((long)c);
4214 PyObject *x;
4215
4216 if (w == NULL)
4217 return -1;
4218 x = PyObject_GetItem(mapping, w);
4219 Py_DECREF(w);
4220 if (x == NULL) {
4221 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4222 /* No mapping found means: use 1:1 mapping. */
4223 PyErr_Clear();
4224 *result = NULL;
4225 return 0;
4226 } else
4227 return -1;
4228 }
4229 else if (x == Py_None) {
4230 *result = x;
4231 return 0;
4232 }
4233 else if (PyInt_Check(x)) {
4234 long value = PyInt_AS_LONG(x);
4235 long max = PyUnicode_GetMax();
4236 if (value < 0 || value > max) {
4237 PyErr_Format(PyExc_TypeError,
4238 "character mapping must be in range(0x%lx)", max+1);
4239 Py_DECREF(x);
4240 return -1;
4241 }
4242 *result = x;
4243 return 0;
4244 }
4245 else if (PyUnicode_Check(x)) {
4246 *result = x;
4247 return 0;
4248 }
4249 else {
4250 /* wrong return value */
4251 PyErr_SetString(PyExc_TypeError,
4252 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004253 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 return -1;
4255 }
4256}
4257/* ensure that *outobj is at least requiredsize characters long,
4258if not reallocate and adjust various state variables.
4259Return 0 on success, -1 on error */
4260static
Walter Dörwald4894c302003-10-24 14:25:28 +00004261int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004262 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004264 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004265 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004267 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004269 if (requiredsize < 2 * oldsize)
4270 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004271 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 return -1;
4273 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 }
4275 return 0;
4276}
4277/* lookup the character, put the result in the output string and adjust
4278 various state variables. Return a new reference to the object that
4279 was put in the output buffer in *result, or Py_None, if the mapping was
4280 undefined (in which case no character was written).
4281 The called must decref result.
4282 Return 0 on success, -1 on error. */
4283static
Walter Dörwald4894c302003-10-24 14:25:28 +00004284int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004285 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004286 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287{
Walter Dörwald4894c302003-10-24 14:25:28 +00004288 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 return -1;
4290 if (*res==NULL) {
4291 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004292 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 }
4294 else if (*res==Py_None)
4295 ;
4296 else if (PyInt_Check(*res)) {
4297 /* no overflow check, because we know that the space is enough */
4298 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4299 }
4300 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004301 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 if (repsize==1) {
4303 /* no overflow check, because we know that the space is enough */
4304 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4305 }
4306 else if (repsize!=0) {
4307 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004308 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004309 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004310 repsize - 1;
4311 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 return -1;
4313 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4314 *outp += repsize;
4315 }
4316 }
4317 else
4318 return -1;
4319 return 0;
4320}
4321
4322PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004323 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 PyObject *mapping,
4325 const char *errors)
4326{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 /* output object */
4328 PyObject *res = NULL;
4329 /* pointers to the beginning and end+1 of input */
4330 const Py_UNICODE *startp = p;
4331 const Py_UNICODE *endp = p + size;
4332 /* pointer into the output */
4333 Py_UNICODE *str;
4334 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004335 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 char *reason = "character maps to <undefined>";
4337 PyObject *errorHandler = NULL;
4338 PyObject *exc = NULL;
4339 /* the following variable is used for caching string comparisons
4340 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4341 * 3=ignore, 4=xmlcharrefreplace */
4342 int known_errorHandler = -1;
4343
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 if (mapping == NULL) {
4345 PyErr_BadArgument();
4346 return NULL;
4347 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348
4349 /* allocate enough for a simple 1:1 translation without
4350 replacements, if we need more, we'll resize */
4351 res = PyUnicode_FromUnicode(NULL, size);
4352 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004353 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 return res;
4356 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 while (p<endp) {
4359 /* try to encode it */
4360 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004361 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 goto onError;
4364 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004365 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 if (x!=Py_None) /* it worked => adjust input pointer */
4367 ++p;
4368 else { /* untranslatable character */
4369 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004370 Py_ssize_t repsize;
4371 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 Py_UNICODE *uni2;
4373 /* startpos for collecting untranslatable chars */
4374 const Py_UNICODE *collstart = p;
4375 const Py_UNICODE *collend = p+1;
4376 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 /* find all untranslatable characters */
4379 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004380 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 goto onError;
4382 Py_XDECREF(x);
4383 if (x!=Py_None)
4384 break;
4385 ++collend;
4386 }
4387 /* cache callback name lookup
4388 * (if not done yet, i.e. it's the first error) */
4389 if (known_errorHandler==-1) {
4390 if ((errors==NULL) || (!strcmp(errors, "strict")))
4391 known_errorHandler = 1;
4392 else if (!strcmp(errors, "replace"))
4393 known_errorHandler = 2;
4394 else if (!strcmp(errors, "ignore"))
4395 known_errorHandler = 3;
4396 else if (!strcmp(errors, "xmlcharrefreplace"))
4397 known_errorHandler = 4;
4398 else
4399 known_errorHandler = 0;
4400 }
4401 switch (known_errorHandler) {
4402 case 1: /* strict */
4403 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4404 goto onError;
4405 case 2: /* replace */
4406 /* No need to check for space, this is a 1:1 replacement */
4407 for (coll = collstart; coll<collend; ++coll)
4408 *str++ = '?';
4409 /* fall through */
4410 case 3: /* ignore */
4411 p = collend;
4412 break;
4413 case 4: /* xmlcharrefreplace */
4414 /* generate replacement (temporarily (mis)uses p) */
4415 for (p = collstart; p < collend; ++p) {
4416 char buffer[2+29+1+1];
4417 char *cp;
4418 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004419 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4421 goto onError;
4422 for (cp = buffer; *cp; ++cp)
4423 *str++ = *cp;
4424 }
4425 p = collend;
4426 break;
4427 default:
4428 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4429 reason, startp, size, &exc,
4430 collstart-startp, collend-startp, &newpos);
4431 if (repunicode == NULL)
4432 goto onError;
4433 /* generate replacement */
4434 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004435 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4437 Py_DECREF(repunicode);
4438 goto onError;
4439 }
4440 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4441 *str++ = *uni2;
4442 p = startp + newpos;
4443 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 }
4445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 /* Resize if we allocated to much */
4448 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004449 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004450 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 }
4453 Py_XDECREF(exc);
4454 Py_XDECREF(errorHandler);
4455 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 onError:
4458 Py_XDECREF(res);
4459 Py_XDECREF(exc);
4460 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 return NULL;
4462}
4463
4464PyObject *PyUnicode_Translate(PyObject *str,
4465 PyObject *mapping,
4466 const char *errors)
4467{
4468 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004469
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 str = PyUnicode_FromObject(str);
4471 if (str == NULL)
4472 goto onError;
4473 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4474 PyUnicode_GET_SIZE(str),
4475 mapping,
4476 errors);
4477 Py_DECREF(str);
4478 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 onError:
4481 Py_XDECREF(str);
4482 return NULL;
4483}
Tim Petersced69f82003-09-16 20:30:58 +00004484
Guido van Rossum9e896b32000-04-05 20:11:21 +00004485/* --- Decimal Encoder ---------------------------------------------------- */
4486
4487int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004489 char *output,
4490 const char *errors)
4491{
4492 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 PyObject *errorHandler = NULL;
4494 PyObject *exc = NULL;
4495 const char *encoding = "decimal";
4496 const char *reason = "invalid decimal Unicode string";
4497 /* the following variable is used for caching string comparisons
4498 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4499 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004500
4501 if (output == NULL) {
4502 PyErr_BadArgument();
4503 return -1;
4504 }
4505
4506 p = s;
4507 end = s + length;
4508 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004510 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004512 Py_ssize_t repsize;
4513 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 Py_UNICODE *uni2;
4515 Py_UNICODE *collstart;
4516 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004517
Guido van Rossum9e896b32000-04-05 20:11:21 +00004518 if (Py_UNICODE_ISSPACE(ch)) {
4519 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004521 continue;
4522 }
4523 decimal = Py_UNICODE_TODECIMAL(ch);
4524 if (decimal >= 0) {
4525 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004527 continue;
4528 }
Guido van Rossumba477042000-04-06 18:18:10 +00004529 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004530 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004532 continue;
4533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 /* All other characters are considered unencodable */
4535 collstart = p;
4536 collend = p+1;
4537 while (collend < end) {
4538 if ((0 < *collend && *collend < 256) ||
4539 !Py_UNICODE_ISSPACE(*collend) ||
4540 Py_UNICODE_TODECIMAL(*collend))
4541 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 /* cache callback name lookup
4544 * (if not done yet, i.e. it's the first error) */
4545 if (known_errorHandler==-1) {
4546 if ((errors==NULL) || (!strcmp(errors, "strict")))
4547 known_errorHandler = 1;
4548 else if (!strcmp(errors, "replace"))
4549 known_errorHandler = 2;
4550 else if (!strcmp(errors, "ignore"))
4551 known_errorHandler = 3;
4552 else if (!strcmp(errors, "xmlcharrefreplace"))
4553 known_errorHandler = 4;
4554 else
4555 known_errorHandler = 0;
4556 }
4557 switch (known_errorHandler) {
4558 case 1: /* strict */
4559 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4560 goto onError;
4561 case 2: /* replace */
4562 for (p = collstart; p < collend; ++p)
4563 *output++ = '?';
4564 /* fall through */
4565 case 3: /* ignore */
4566 p = collend;
4567 break;
4568 case 4: /* xmlcharrefreplace */
4569 /* generate replacement (temporarily (mis)uses p) */
4570 for (p = collstart; p < collend; ++p)
4571 output += sprintf(output, "&#%d;", (int)*p);
4572 p = collend;
4573 break;
4574 default:
4575 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4576 encoding, reason, s, length, &exc,
4577 collstart-s, collend-s, &newpos);
4578 if (repunicode == NULL)
4579 goto onError;
4580 /* generate replacement */
4581 repsize = PyUnicode_GET_SIZE(repunicode);
4582 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4583 Py_UNICODE ch = *uni2;
4584 if (Py_UNICODE_ISSPACE(ch))
4585 *output++ = ' ';
4586 else {
4587 decimal = Py_UNICODE_TODECIMAL(ch);
4588 if (decimal >= 0)
4589 *output++ = '0' + decimal;
4590 else if (0 < ch && ch < 256)
4591 *output++ = (char)ch;
4592 else {
4593 Py_DECREF(repunicode);
4594 raise_encode_exception(&exc, encoding,
4595 s, length, collstart-s, collend-s, reason);
4596 goto onError;
4597 }
4598 }
4599 }
4600 p = s + newpos;
4601 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004602 }
4603 }
4604 /* 0-terminate the output string */
4605 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 Py_XDECREF(exc);
4607 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004608 return 0;
4609
4610 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 Py_XDECREF(exc);
4612 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004613 return -1;
4614}
4615
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616/* --- Helpers ------------------------------------------------------------ */
4617
Thomas Wouters477c8d52006-05-27 19:21:47 +00004618#define STRINGLIB_CHAR Py_UNICODE
4619
4620#define STRINGLIB_LEN PyUnicode_GET_SIZE
4621#define STRINGLIB_NEW PyUnicode_FromUnicode
4622#define STRINGLIB_STR PyUnicode_AS_UNICODE
4623
4624Py_LOCAL_INLINE(int)
4625STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004627 if (str[0] != other[0])
4628 return 1;
4629 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630}
4631
Thomas Wouters477c8d52006-05-27 19:21:47 +00004632#define STRINGLIB_EMPTY unicode_empty
4633
4634#include "stringlib/fastsearch.h"
4635
4636#include "stringlib/count.h"
4637#include "stringlib/find.h"
4638#include "stringlib/partition.h"
4639
/* Helper macro to fix up slice values.  It operates on variables named
   `start` and `end` in the scope where it is expanded: negative values
   are offset by (obj)->length and then floored at 0, and `end` is capped
   at (obj)->length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4652
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004654 PyObject *substr,
4655 Py_ssize_t start,
4656 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004659 PyUnicodeObject* str_obj;
4660 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004661
Thomas Wouters477c8d52006-05-27 19:21:47 +00004662 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4663 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004665 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4666 if (!sub_obj) {
4667 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 return -1;
4669 }
Tim Petersced69f82003-09-16 20:30:58 +00004670
Thomas Wouters477c8d52006-05-27 19:21:47 +00004671 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004672
Thomas Wouters477c8d52006-05-27 19:21:47 +00004673 result = stringlib_count(
4674 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4675 );
4676
4677 Py_DECREF(sub_obj);
4678 Py_DECREF(str_obj);
4679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 return result;
4681}
4682
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004684 PyObject *sub,
4685 Py_ssize_t start,
4686 Py_ssize_t end,
4687 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004689 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004690
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004692 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004693 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004694 sub = PyUnicode_FromObject(sub);
4695 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004696 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004697 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 }
Tim Petersced69f82003-09-16 20:30:58 +00004699
Thomas Wouters477c8d52006-05-27 19:21:47 +00004700 if (direction > 0)
4701 result = stringlib_find_slice(
4702 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4703 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4704 start, end
4705 );
4706 else
4707 result = stringlib_rfind_slice(
4708 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4709 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4710 start, end
4711 );
4712
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004714 Py_DECREF(sub);
4715
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 return result;
4717}
4718
Tim Petersced69f82003-09-16 20:30:58 +00004719static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720int tailmatch(PyUnicodeObject *self,
4721 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 Py_ssize_t start,
4723 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 int direction)
4725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 if (substring->length == 0)
4727 return 1;
4728
Thomas Wouters477c8d52006-05-27 19:21:47 +00004729 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730
4731 end -= substring->length;
4732 if (end < start)
4733 return 0;
4734
4735 if (direction > 0) {
4736 if (Py_UNICODE_MATCH(self, end, substring))
4737 return 1;
4738 } else {
4739 if (Py_UNICODE_MATCH(self, start, substring))
4740 return 1;
4741 }
4742
4743 return 0;
4744}
4745
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004748 Py_ssize_t start,
4749 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 int direction)
4751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 str = PyUnicode_FromObject(str);
4755 if (str == NULL)
4756 return -1;
4757 substr = PyUnicode_FromObject(substr);
4758 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004759 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 return -1;
4761 }
Tim Petersced69f82003-09-16 20:30:58 +00004762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 result = tailmatch((PyUnicodeObject *)str,
4764 (PyUnicodeObject *)substr,
4765 start, end, direction);
4766 Py_DECREF(str);
4767 Py_DECREF(substr);
4768 return result;
4769}
4770
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771/* Apply fixfct filter to the Unicode object self and return a
4772 reference to the modified object */
4773
Tim Petersced69f82003-09-16 20:30:58 +00004774static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775PyObject *fixup(PyUnicodeObject *self,
4776 int (*fixfct)(PyUnicodeObject *s))
4777{
4778
4779 PyUnicodeObject *u;
4780
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004781 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 if (u == NULL)
4783 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004784
4785 Py_UNICODE_COPY(u->str, self->str, self->length);
4786
Tim Peters7a29bd52001-09-12 03:03:31 +00004787 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 /* fixfct should return TRUE if it modified the buffer. If
4789 FALSE, return a reference to the original buffer instead
4790 (to save space, not time) */
4791 Py_INCREF(self);
4792 Py_DECREF(u);
4793 return (PyObject*) self;
4794 }
4795 return (PyObject*) u;
4796}
4797
Tim Petersced69f82003-09-16 20:30:58 +00004798static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799int fixupper(PyUnicodeObject *self)
4800{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004801 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 Py_UNICODE *s = self->str;
4803 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 while (len-- > 0) {
4806 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 ch = Py_UNICODE_TOUPPER(*s);
4809 if (ch != *s) {
4810 status = 1;
4811 *s = ch;
4812 }
4813 s++;
4814 }
4815
4816 return status;
4817}
4818
Tim Petersced69f82003-09-16 20:30:58 +00004819static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820int fixlower(PyUnicodeObject *self)
4821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 Py_UNICODE *s = self->str;
4824 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004825
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 while (len-- > 0) {
4827 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 ch = Py_UNICODE_TOLOWER(*s);
4830 if (ch != *s) {
4831 status = 1;
4832 *s = ch;
4833 }
4834 s++;
4835 }
4836
4837 return status;
4838}
4839
Tim Petersced69f82003-09-16 20:30:58 +00004840static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841int fixswapcase(PyUnicodeObject *self)
4842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004843 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 Py_UNICODE *s = self->str;
4845 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004846
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 while (len-- > 0) {
4848 if (Py_UNICODE_ISUPPER(*s)) {
4849 *s = Py_UNICODE_TOLOWER(*s);
4850 status = 1;
4851 } else if (Py_UNICODE_ISLOWER(*s)) {
4852 *s = Py_UNICODE_TOUPPER(*s);
4853 status = 1;
4854 }
4855 s++;
4856 }
4857
4858 return status;
4859}
4860
Tim Petersced69f82003-09-16 20:30:58 +00004861static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862int fixcapitalize(PyUnicodeObject *self)
4863{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004865 Py_UNICODE *s = self->str;
4866 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004867
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004868 if (len == 0)
4869 return 0;
4870 if (Py_UNICODE_ISLOWER(*s)) {
4871 *s = Py_UNICODE_TOUPPER(*s);
4872 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004874 s++;
4875 while (--len > 0) {
4876 if (Py_UNICODE_ISUPPER(*s)) {
4877 *s = Py_UNICODE_TOLOWER(*s);
4878 status = 1;
4879 }
4880 s++;
4881 }
4882 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
4885static
4886int fixtitle(PyUnicodeObject *self)
4887{
4888 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4889 register Py_UNICODE *e;
4890 int previous_is_cased;
4891
4892 /* Shortcut for single character strings */
4893 if (PyUnicode_GET_SIZE(self) == 1) {
4894 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4895 if (*p != ch) {
4896 *p = ch;
4897 return 1;
4898 }
4899 else
4900 return 0;
4901 }
Tim Petersced69f82003-09-16 20:30:58 +00004902
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 e = p + PyUnicode_GET_SIZE(self);
4904 previous_is_cased = 0;
4905 for (; p < e; p++) {
4906 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004907
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 if (previous_is_cased)
4909 *p = Py_UNICODE_TOLOWER(ch);
4910 else
4911 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004912
4913 if (Py_UNICODE_ISLOWER(ch) ||
4914 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 Py_UNICODE_ISTITLE(ch))
4916 previous_is_cased = 1;
4917 else
4918 previous_is_cased = 0;
4919 }
4920 return 1;
4921}
4922
Tim Peters8ce9f162004-08-27 01:49:32 +00004923PyObject *
4924PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925{
Tim Peters8ce9f162004-08-27 01:49:32 +00004926 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004927 const Py_UNICODE blank = ' ';
4928 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004929 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004930 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004931 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4932 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004933 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4934 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004936 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004937 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 fseq = PySequence_Fast(seq, "");
4940 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004941 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004942 }
4943
Tim Peters91879ab2004-08-27 22:35:44 +00004944 /* Grrrr. A codec may be invoked to convert str objects to
4945 * Unicode, and so it's possible to call back into Python code
4946 * during PyUnicode_FromObject(), and so it's possible for a sick
4947 * codec to change the size of fseq (if seq is a list). Therefore
4948 * we have to keep refetching the size -- can't assume seqlen
4949 * is invariant.
4950 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004951 seqlen = PySequence_Fast_GET_SIZE(fseq);
4952 /* If empty sequence, return u"". */
4953 if (seqlen == 0) {
4954 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4955 goto Done;
4956 }
4957 /* If singleton sequence with an exact Unicode, return that. */
4958 if (seqlen == 1) {
4959 item = PySequence_Fast_GET_ITEM(fseq, 0);
4960 if (PyUnicode_CheckExact(item)) {
4961 Py_INCREF(item);
4962 res = (PyUnicodeObject *)item;
4963 goto Done;
4964 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004965 }
4966
Tim Peters05eba1f2004-08-27 21:32:02 +00004967 /* At least two items to join, or one that isn't exact Unicode. */
4968 if (seqlen > 1) {
4969 /* Set up sep and seplen -- they're needed. */
4970 if (separator == NULL) {
4971 sep = &blank;
4972 seplen = 1;
4973 }
4974 else {
4975 internal_separator = PyUnicode_FromObject(separator);
4976 if (internal_separator == NULL)
4977 goto onError;
4978 sep = PyUnicode_AS_UNICODE(internal_separator);
4979 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004980 /* In case PyUnicode_FromObject() mutated seq. */
4981 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004982 }
4983 }
4984
4985 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004986 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004987 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004988 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004989 res_p = PyUnicode_AS_UNICODE(res);
4990 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004991
Tim Peters05eba1f2004-08-27 21:32:02 +00004992 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004993 Py_ssize_t itemlen;
4994 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004995
4996 item = PySequence_Fast_GET_ITEM(fseq, i);
4997 /* Convert item to Unicode. */
4998 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4999 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005000 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005001 " %.80s found",
5002 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005003 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005004 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005005 item = PyUnicode_FromObject(item);
5006 if (item == NULL)
5007 goto onError;
5008 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005009
Tim Peters91879ab2004-08-27 22:35:44 +00005010 /* In case PyUnicode_FromObject() mutated seq. */
5011 seqlen = PySequence_Fast_GET_SIZE(fseq);
5012
Tim Peters8ce9f162004-08-27 01:49:32 +00005013 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005015 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005016 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005017 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005018 if (i < seqlen - 1) {
5019 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005020 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005021 goto Overflow;
5022 }
5023 if (new_res_used > res_alloc) {
5024 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005025 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005026 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005027 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005028 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005029 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005030 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005031 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005033 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005034 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005036
5037 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005038 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005039 res_p += itemlen;
5040 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005041 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005042 res_p += seplen;
5043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005045 res_used = new_res_used;
5046 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005047
Tim Peters05eba1f2004-08-27 21:32:02 +00005048 /* Shrink res to match the used area; this probably can't fail,
5049 * but it's cheap to check.
5050 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005051 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005052 goto onError;
5053
5054 Done:
5055 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005056 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 return (PyObject *)res;
5058
Tim Peters8ce9f162004-08-27 01:49:32 +00005059 Overflow:
5060 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005061 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005062 Py_DECREF(item);
5063 /* fall through */
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005066 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005067 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005068 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 return NULL;
5070}
5071
Tim Petersced69f82003-09-16 20:30:58 +00005072static
5073PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074 Py_ssize_t left,
5075 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 Py_UNICODE fill)
5077{
5078 PyUnicodeObject *u;
5079
5080 if (left < 0)
5081 left = 0;
5082 if (right < 0)
5083 right = 0;
5084
Tim Peters7a29bd52001-09-12 03:03:31 +00005085 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 Py_INCREF(self);
5087 return self;
5088 }
5089
5090 u = _PyUnicode_New(left + self->length + right);
5091 if (u) {
5092 if (left)
5093 Py_UNICODE_FILL(u->str, fill, left);
5094 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5095 if (right)
5096 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5097 }
5098
5099 return u;
5100}
5101
/* Append the slice data[left:right] to the result list as a new unicode
 * object.
 *
 * Relies on the invoking function providing: a `PyObject *str` scratch
 * variable, a `PyObject *list` to append to, and an `onError` label that is
 * jumped to when either the object creation or the append fails.  The
 * temporary reference held in `str` is always released.
 *
 * Wrapped in do { } while (0) so the expansion is one statement; invoke it
 * with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5112
5113static
5114PyObject *split_whitespace(PyUnicodeObject *self,
5115 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005116 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005118 register Py_ssize_t i;
5119 register Py_ssize_t j;
5120 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 PyObject *str;
5122
5123 for (i = j = 0; i < len; ) {
5124 /* find a token */
5125 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5126 i++;
5127 j = i;
5128 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5129 i++;
5130 if (j < i) {
5131 if (maxcount-- <= 0)
5132 break;
5133 SPLIT_APPEND(self->str, j, i);
5134 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5135 i++;
5136 j = i;
5137 }
5138 }
5139 if (j < len) {
5140 SPLIT_APPEND(self->str, j, len);
5141 }
5142 return list;
5143
5144 onError:
5145 Py_DECREF(list);
5146 return NULL;
5147}
5148
5149PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005150 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005152 register Py_ssize_t i;
5153 register Py_ssize_t j;
5154 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 PyObject *list;
5156 PyObject *str;
5157 Py_UNICODE *data;
5158
5159 string = PyUnicode_FromObject(string);
5160 if (string == NULL)
5161 return NULL;
5162 data = PyUnicode_AS_UNICODE(string);
5163 len = PyUnicode_GET_SIZE(string);
5164
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 list = PyList_New(0);
5166 if (!list)
5167 goto onError;
5168
5169 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005173 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
5176 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005177 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 if (i < len) {
5179 if (data[i] == '\r' && i + 1 < len &&
5180 data[i+1] == '\n')
5181 i += 2;
5182 else
5183 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005184 if (keepends)
5185 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 }
Guido van Rossum86662912000-04-11 15:38:46 +00005187 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 j = i;
5189 }
5190 if (j < len) {
5191 SPLIT_APPEND(data, j, len);
5192 }
5193
5194 Py_DECREF(string);
5195 return list;
5196
5197 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005198 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 Py_DECREF(string);
5200 return NULL;
5201}
5202
Tim Petersced69f82003-09-16 20:30:58 +00005203static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204PyObject *split_char(PyUnicodeObject *self,
5205 PyObject *list,
5206 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 register Py_ssize_t i;
5210 register Py_ssize_t j;
5211 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 PyObject *str;
5213
5214 for (i = j = 0; i < len; ) {
5215 if (self->str[i] == ch) {
5216 if (maxcount-- <= 0)
5217 break;
5218 SPLIT_APPEND(self->str, j, i);
5219 i = j = i + 1;
5220 } else
5221 i++;
5222 }
5223 if (j <= len) {
5224 SPLIT_APPEND(self->str, j, len);
5225 }
5226 return list;
5227
5228 onError:
5229 Py_DECREF(list);
5230 return NULL;
5231}
5232
Tim Petersced69f82003-09-16 20:30:58 +00005233static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234PyObject *split_substring(PyUnicodeObject *self,
5235 PyObject *list,
5236 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 register Py_ssize_t i;
5240 register Py_ssize_t j;
5241 Py_ssize_t len = self->length;
5242 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 PyObject *str;
5244
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005245 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 if (Py_UNICODE_MATCH(self, i, substring)) {
5247 if (maxcount-- <= 0)
5248 break;
5249 SPLIT_APPEND(self->str, j, i);
5250 i = j = i + sublen;
5251 } else
5252 i++;
5253 }
5254 if (j <= len) {
5255 SPLIT_APPEND(self->str, j, len);
5256 }
5257 return list;
5258
5259 onError:
5260 Py_DECREF(list);
5261 return NULL;
5262}
5263
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005264static
5265PyObject *rsplit_whitespace(PyUnicodeObject *self,
5266 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005268{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 register Py_ssize_t i;
5270 register Py_ssize_t j;
5271 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005272 PyObject *str;
5273
5274 for (i = j = len - 1; i >= 0; ) {
5275 /* find a token */
5276 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5277 i--;
5278 j = i;
5279 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5280 i--;
5281 if (j > i) {
5282 if (maxcount-- <= 0)
5283 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005284 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005285 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5286 i--;
5287 j = i;
5288 }
5289 }
5290 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005291 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005292 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005293 if (PyList_Reverse(list) < 0)
5294 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005295 return list;
5296
5297 onError:
5298 Py_DECREF(list);
5299 return NULL;
5300}
5301
5302static
5303PyObject *rsplit_char(PyUnicodeObject *self,
5304 PyObject *list,
5305 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 register Py_ssize_t i;
5309 register Py_ssize_t j;
5310 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005311 PyObject *str;
5312
5313 for (i = j = len - 1; i >= 0; ) {
5314 if (self->str[i] == ch) {
5315 if (maxcount-- <= 0)
5316 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005317 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005318 j = i = i - 1;
5319 } else
5320 i--;
5321 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005322 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005323 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005324 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005325 if (PyList_Reverse(list) < 0)
5326 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005327 return list;
5328
5329 onError:
5330 Py_DECREF(list);
5331 return NULL;
5332}
5333
5334static
5335PyObject *rsplit_substring(PyUnicodeObject *self,
5336 PyObject *list,
5337 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005340 register Py_ssize_t i;
5341 register Py_ssize_t j;
5342 Py_ssize_t len = self->length;
5343 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005344 PyObject *str;
5345
5346 for (i = len - sublen, j = len; i >= 0; ) {
5347 if (Py_UNICODE_MATCH(self, i, substring)) {
5348 if (maxcount-- <= 0)
5349 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005350 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005351 j = i;
5352 i -= sublen;
5353 } else
5354 i--;
5355 }
5356 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005357 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005358 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005359 if (PyList_Reverse(list) < 0)
5360 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005361 return list;
5362
5363 onError:
5364 Py_DECREF(list);
5365 return NULL;
5366}
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368#undef SPLIT_APPEND
5369
5370static
5371PyObject *split(PyUnicodeObject *self,
5372 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005373 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374{
5375 PyObject *list;
5376
5377 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005378 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
5380 list = PyList_New(0);
5381 if (!list)
5382 return NULL;
5383
5384 if (substring == NULL)
5385 return split_whitespace(self,list,maxcount);
5386
5387 else if (substring->length == 1)
5388 return split_char(self,list,substring->str[0],maxcount);
5389
5390 else if (substring->length == 0) {
5391 Py_DECREF(list);
5392 PyErr_SetString(PyExc_ValueError, "empty separator");
5393 return NULL;
5394 }
5395 else
5396 return split_substring(self,list,substring,maxcount);
5397}
5398
Tim Petersced69f82003-09-16 20:30:58 +00005399static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005400PyObject *rsplit(PyUnicodeObject *self,
5401 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005402 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005403{
5404 PyObject *list;
5405
5406 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005407 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005408
5409 list = PyList_New(0);
5410 if (!list)
5411 return NULL;
5412
5413 if (substring == NULL)
5414 return rsplit_whitespace(self,list,maxcount);
5415
5416 else if (substring->length == 1)
5417 return rsplit_char(self,list,substring->str[0],maxcount);
5418
5419 else if (substring->length == 0) {
5420 Py_DECREF(list);
5421 PyErr_SetString(PyExc_ValueError, "empty separator");
5422 return NULL;
5423 }
5424 else
5425 return rsplit_substring(self,list,substring,maxcount);
5426}
5427
5428static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429PyObject *replace(PyUnicodeObject *self,
5430 PyUnicodeObject *str1,
5431 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005432 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
5434 PyUnicodeObject *u;
5435
5436 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005437 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
Thomas Wouters477c8d52006-05-27 19:21:47 +00005439 if (str1->length == str2->length) {
5440 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005441 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 if (str1->length == 1) {
5443 /* replace characters */
5444 Py_UNICODE u1, u2;
5445 if (!findchar(self->str, self->length, str1->str[0]))
5446 goto nothing;
5447 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5448 if (!u)
5449 return NULL;
5450 Py_UNICODE_COPY(u->str, self->str, self->length);
5451 u1 = str1->str[0];
5452 u2 = str2->str[0];
5453 for (i = 0; i < u->length; i++)
5454 if (u->str[i] == u1) {
5455 if (--maxcount < 0)
5456 break;
5457 u->str[i] = u2;
5458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005460 i = fastsearch(
5461 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005463 if (i < 0)
5464 goto nothing;
5465 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5466 if (!u)
5467 return NULL;
5468 Py_UNICODE_COPY(u->str, self->str, self->length);
5469 while (i <= self->length - str1->length)
5470 if (Py_UNICODE_MATCH(self, i, str1)) {
5471 if (--maxcount < 0)
5472 break;
5473 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5474 i += str1->length;
5475 } else
5476 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005479
5480 Py_ssize_t n, i, j, e;
5481 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 Py_UNICODE *p;
5483
5484 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005485 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 if (n > maxcount)
5487 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005488 if (n == 0)
5489 goto nothing;
5490 /* new_size = self->length + n * (str2->length - str1->length)); */
5491 delta = (str2->length - str1->length);
5492 if (delta == 0) {
5493 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005495 product = n * (str2->length - str1->length);
5496 if ((product / (str2->length - str1->length)) != n) {
5497 PyErr_SetString(PyExc_OverflowError,
5498 "replace string is too long");
5499 return NULL;
5500 }
5501 new_size = self->length + product;
5502 if (new_size < 0) {
5503 PyErr_SetString(PyExc_OverflowError,
5504 "replace string is too long");
5505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 }
5507 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005508 u = _PyUnicode_New(new_size);
5509 if (!u)
5510 return NULL;
5511 i = 0;
5512 p = u->str;
5513 e = self->length - str1->length;
5514 if (str1->length > 0) {
5515 while (n-- > 0) {
5516 /* look for next match */
5517 j = i;
5518 while (j <= e) {
5519 if (Py_UNICODE_MATCH(self, j, str1))
5520 break;
5521 j++;
5522 }
5523 if (j > i) {
5524 if (j > e)
5525 break;
5526 /* copy unchanged part [i:j] */
5527 Py_UNICODE_COPY(p, self->str+i, j-i);
5528 p += j - i;
5529 }
5530 /* copy substitution string */
5531 if (str2->length > 0) {
5532 Py_UNICODE_COPY(p, str2->str, str2->length);
5533 p += str2->length;
5534 }
5535 i = j + str1->length;
5536 }
5537 if (i < self->length)
5538 /* copy tail [i:] */
5539 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5540 } else {
5541 /* interleave */
5542 while (n > 0) {
5543 Py_UNICODE_COPY(p, str2->str, str2->length);
5544 p += str2->length;
5545 if (--n <= 0)
5546 break;
5547 *p++ = self->str[i++];
5548 }
5549 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005553
5554nothing:
5555 /* nothing to replace; return original string (when possible) */
5556 if (PyUnicode_CheckExact(self)) {
5557 Py_INCREF(self);
5558 return (PyObject *) self;
5559 }
5560 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561}
5562
5563/* --- Unicode Object Methods --------------------------------------------- */
5564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005565PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566"S.title() -> unicode\n\
5567\n\
5568Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005569characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570
5571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005572unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return fixup(self, fixtitle);
5575}
5576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005577PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578"S.capitalize() -> unicode\n\
5579\n\
5580Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005581have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005584unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 return fixup(self, fixcapitalize);
5587}
5588
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Implementation of S.capwords() (currently compiled out by the
   surrounding #if 0).

   Splits self with split(self, NULL, -1), replaces every list item by
   the result of fixup(item, fixcapitalize), and joins the list again
   with PyUnicode_Join(NULL, list).  Returns NULL as soon as the split
   or one of the fixup() calls fails; the word list is released on every
   path once it has been created. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before the slot is
           overwritten; PyList_SET_ITEM does not do that itself. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5626
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005627/* Argument converter. Coerces to a single unicode character */
5628
5629static int
5630convert_uc(PyObject *obj, void *addr)
5631{
5632 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5633 PyObject *uniobj;
5634 Py_UNICODE *unistr;
5635
5636 uniobj = PyUnicode_FromObject(obj);
5637 if (uniobj == NULL) {
5638 PyErr_SetString(PyExc_TypeError,
5639 "The fill character cannot be converted to Unicode");
5640 return 0;
5641 }
5642 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5643 PyErr_SetString(PyExc_TypeError,
5644 "The fill character must be exactly one character long");
5645 Py_DECREF(uniobj);
5646 return 0;
5647 }
5648 unistr = PyUnicode_AS_UNICODE(uniobj);
5649 *fillcharloc = unistr[0];
5650 Py_DECREF(uniobj);
5651 return 1;
5652}
5653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005654PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005655"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005657Return S centered in a Unicode string of length width. Padding is\n\
5658done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
5660static PyObject *
5661unicode_center(PyUnicodeObject *self, PyObject *args)
5662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005663 Py_ssize_t marg, left;
5664 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005665 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Thomas Woutersde017742006-02-16 19:34:37 +00005667 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 return NULL;
5669
Tim Peters7a29bd52001-09-12 03:03:31 +00005670 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 Py_INCREF(self);
5672 return (PyObject*) self;
5673 }
5674
5675 marg = width - self->length;
5676 left = marg / 2 + (marg & width & 1);
5677
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005678 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679}
5680
Marc-André Lemburge5034372000-08-08 08:04:29 +00005681#if 0
5682
5683/* This code should go into some future Unicode collation support
5684 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005685 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005686
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005687/* speedy UTF-16 code point order comparison */
5688/* gleaned from: */
5689/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5690
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005691static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005692{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005693 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005694 0, 0, 0, 0, 0, 0, 0, 0,
5695 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005696 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005697};
5698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699static int
5700unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005702 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 Py_UNICODE *s1 = str1->str;
5705 Py_UNICODE *s2 = str2->str;
5706
5707 len1 = str1->length;
5708 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005709
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005711 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005712
5713 c1 = *s1++;
5714 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005715
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005716 if (c1 > (1<<11) * 26)
5717 c1 += utf16Fixup[c1>>11];
5718 if (c2 > (1<<11) * 26)
5719 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005720 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005721
5722 if (c1 != c2)
5723 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005724
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005725 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
5727
5728 return (len1 < len2) ? -1 : (len1 != len2);
5729}
5730
Marc-André Lemburge5034372000-08-08 08:04:29 +00005731#else
5732
5733static int
5734unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5735{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005736 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005737
5738 Py_UNICODE *s1 = str1->str;
5739 Py_UNICODE *s2 = str2->str;
5740
5741 len1 = str1->length;
5742 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005743
Marc-André Lemburge5034372000-08-08 08:04:29 +00005744 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005745 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005746
Fredrik Lundh45714e92001-06-26 16:39:36 +00005747 c1 = *s1++;
5748 c2 = *s2++;
5749
5750 if (c1 != c2)
5751 return (c1 < c2) ? -1 : 1;
5752
Marc-André Lemburge5034372000-08-08 08:04:29 +00005753 len1--; len2--;
5754 }
5755
5756 return (len1 < len2) ? -1 : (len1 != len2);
5757}
5758
5759#endif
5760
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761int PyUnicode_Compare(PyObject *left,
5762 PyObject *right)
5763{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005764 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5765 return unicode_compare((PyUnicodeObject *)left,
5766 (PyUnicodeObject *)right);
5767 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5768 (PyUnicode_Check(left) && PyString_Check(right))) {
5769 if (PyUnicode_Check(left))
5770 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5771 if (PyUnicode_Check(right))
5772 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5773 assert(PyString_Check(left));
5774 assert(PyString_Check(right));
5775 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005777 PyErr_Format(PyExc_TypeError,
5778 "Can't compare %.100s and %.100s",
5779 left->ob_type->tp_name,
5780 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 return -1;
5782}
5783
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005784PyObject *PyUnicode_RichCompare(PyObject *left,
5785 PyObject *right,
5786 int op)
5787{
5788 int result;
5789
5790 result = PyUnicode_Compare(left, right);
5791 if (result == -1 && PyErr_Occurred())
5792 goto onError;
5793
5794 /* Convert the return value to a Boolean */
5795 switch (op) {
5796 case Py_EQ:
5797 result = (result == 0);
5798 break;
5799 case Py_NE:
5800 result = (result != 0);
5801 break;
5802 case Py_LE:
5803 result = (result <= 0);
5804 break;
5805 case Py_GE:
5806 result = (result >= 0);
5807 break;
5808 case Py_LT:
5809 result = (result == -1);
5810 break;
5811 case Py_GT:
5812 result = (result == 1);
5813 break;
5814 }
5815 return PyBool_FromLong(result);
5816
5817 onError:
5818
5819 /* Standard case
5820
5821 Type errors mean that PyUnicode_FromObject() could not convert
5822 one of the arguments (usually the right hand side) to Unicode,
5823 ie. we can't handle the comparison request. However, it is
5824 possible that the other object knows a comparison method, which
5825 is why we return Py_NotImplemented to give the other object a
5826 chance.
5827
5828 */
5829 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5830 PyErr_Clear();
5831 Py_INCREF(Py_NotImplemented);
5832 return Py_NotImplemented;
5833 }
5834 if (op != Py_EQ && op != Py_NE)
5835 return NULL;
5836
5837 /* Equality comparison.
5838
5839 This is a special case: we silence any PyExc_UnicodeDecodeError
5840 and instead turn it into a PyErr_UnicodeWarning.
5841
5842 */
5843 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5844 return NULL;
5845 PyErr_Clear();
5846 if (PyErr_Warn(PyExc_UnicodeWarning,
5847 (op == Py_EQ) ?
5848 "Unicode equal comparison "
5849 "failed to convert both arguments to Unicode - "
5850 "interpreting them as being unequal" :
5851 "Unicode unequal comparison "
5852 "failed to convert both arguments to Unicode - "
5853 "interpreting them as being unequal"
5854 ) < 0)
5855 return NULL;
5856 result = (op == Py_NE);
5857 return PyBool_FromLong(result);
5858}
5859
Guido van Rossum403d68b2000-03-13 15:55:09 +00005860int PyUnicode_Contains(PyObject *container,
5861 PyObject *element)
5862{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005865
5866 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005867 sub = PyUnicode_FromObject(element);
5868 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005869 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005870 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005872 }
5873
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 str = PyUnicode_FromObject(container);
5875 if (!str) {
5876 Py_DECREF(sub);
5877 return -1;
5878 }
5879
5880 result = stringlib_contains_obj(str, sub);
5881
5882 Py_DECREF(str);
5883 Py_DECREF(sub);
5884
Guido van Rossum403d68b2000-03-13 15:55:09 +00005885 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005886}
5887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888/* Concat to string or Unicode object giving a new Unicode object. */
5889
5890PyObject *PyUnicode_Concat(PyObject *left,
5891 PyObject *right)
5892{
5893 PyUnicodeObject *u = NULL, *v = NULL, *w;
5894
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005895 if (PyBytes_Check(left) || PyBytes_Check(right))
5896 return PyBytes_Concat(left, right);
5897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 /* Coerce the two arguments */
5899 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5900 if (u == NULL)
5901 goto onError;
5902 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5903 if (v == NULL)
5904 goto onError;
5905
5906 /* Shortcuts */
5907 if (v == unicode_empty) {
5908 Py_DECREF(v);
5909 return (PyObject *)u;
5910 }
5911 if (u == unicode_empty) {
5912 Py_DECREF(u);
5913 return (PyObject *)v;
5914 }
5915
5916 /* Concat the two Unicode strings */
5917 w = _PyUnicode_New(u->length + v->length);
5918 if (w == NULL)
5919 goto onError;
5920 Py_UNICODE_COPY(w->str, u->str, u->length);
5921 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5922
5923 Py_DECREF(u);
5924 Py_DECREF(v);
5925 return (PyObject *)w;
5926
5927onError:
5928 Py_XDECREF(u);
5929 Py_XDECREF(v);
5930 return NULL;
5931}
5932
Walter Dörwald1ab83302007-05-18 17:15:44 +00005933void
5934PyUnicode_Append(PyObject **pleft, PyObject *right)
5935{
5936 PyObject *new;
5937 if (*pleft == NULL)
5938 return;
5939 if (right == NULL || !PyUnicode_Check(*pleft)) {
5940 Py_DECREF(*pleft);
5941 *pleft = NULL;
5942 return;
5943 }
5944 new = PyUnicode_Concat(*pleft, right);
5945 Py_DECREF(*pleft);
5946 *pleft = new;
5947}
5948
5949void
5950PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
5951{
5952 PyUnicode_Append(pleft, right);
5953 Py_XDECREF(right);
5954}
5955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005956PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957"S.count(sub[, start[, end]]) -> int\n\
5958\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005959Return the number of non-overlapping occurrences of substring sub in\n\
5960Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005961interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
5963static PyObject *
5964unicode_count(PyUnicodeObject *self, PyObject *args)
5965{
5966 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005968 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 PyObject *result;
5970
Guido van Rossumb8872e62000-05-09 14:14:27 +00005971 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5972 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 return NULL;
5974
5975 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005976 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 if (substring == NULL)
5978 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005979
Thomas Wouters477c8d52006-05-27 19:21:47 +00005980 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
Thomas Wouters477c8d52006-05-27 19:21:47 +00005982 result = PyInt_FromSsize_t(
5983 stringlib_count(self->str + start, end - start,
5984 substring->str, substring->length)
5985 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
5987 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 return result;
5990}
5991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005992PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005993"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005995Encodes S using the codec registered for encoding. encoding defaults\n\
5996to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005997handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5999'xmlcharrefreplace' as well as any other name registered with\n\
6000codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
6002static PyObject *
6003unicode_encode(PyUnicodeObject *self, PyObject *args)
6004{
6005 char *encoding = NULL;
6006 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006007 PyObject *v;
6008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6010 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006011 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006012 if (v == NULL)
6013 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006014 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006015 if (PyString_Check(v)) {
6016 /* Old codec, turn it into bytes */
6017 PyObject *b = PyBytes_FromObject(v);
6018 Py_DECREF(v);
6019 return b;
6020 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006021 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006022 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006023 "(type=%.400s)",
6024 v->ob_type->tp_name);
6025 Py_DECREF(v);
6026 return NULL;
6027 }
6028 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006029
6030 onError:
6031 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006032}
6033
6034PyDoc_STRVAR(decode__doc__,
6035"S.decode([encoding[,errors]]) -> string or unicode\n\
6036\n\
6037Decodes S using the codec registered for encoding. encoding defaults\n\
6038to the default encoding. errors may be given to set a different error\n\
6039handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6040a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6041as well as any other name registerd with codecs.register_error that is\n\
6042able to handle UnicodeDecodeErrors.");
6043
6044static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006045unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006046{
6047 char *encoding = NULL;
6048 char *errors = NULL;
6049 PyObject *v;
6050
6051 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6052 return NULL;
6053 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006054 if (v == NULL)
6055 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006056 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6057 PyErr_Format(PyExc_TypeError,
6058 "decoder did not return a string/unicode object "
6059 "(type=%.400s)",
6060 v->ob_type->tp_name);
6061 Py_DECREF(v);
6062 return NULL;
6063 }
6064 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006065
6066 onError:
6067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068}
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071"S.expandtabs([tabsize]) -> unicode\n\
6072\n\
6073Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
6076static PyObject*
6077unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6078{
6079 Py_UNICODE *e;
6080 Py_UNICODE *p;
6081 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006082 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 PyUnicodeObject *u;
6084 int tabsize = 8;
6085
6086 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6087 return NULL;
6088
Thomas Wouters7e474022000-07-16 12:04:32 +00006089 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 i = j = 0;
6091 e = self->str + self->length;
6092 for (p = self->str; p < e; p++)
6093 if (*p == '\t') {
6094 if (tabsize > 0)
6095 j += tabsize - (j % tabsize);
6096 }
6097 else {
6098 j++;
6099 if (*p == '\n' || *p == '\r') {
6100 i += j;
6101 j = 0;
6102 }
6103 }
6104
6105 /* Second pass: create output string and fill it */
6106 u = _PyUnicode_New(i + j);
6107 if (!u)
6108 return NULL;
6109
6110 j = 0;
6111 q = u->str;
6112
6113 for (p = self->str; p < e; p++)
6114 if (*p == '\t') {
6115 if (tabsize > 0) {
6116 i = tabsize - (j % tabsize);
6117 j += i;
6118 while (i--)
6119 *q++ = ' ';
6120 }
6121 }
6122 else {
6123 j++;
6124 *q++ = *p;
6125 if (*p == '\n' || *p == '\r')
6126 j = 0;
6127 }
6128
6129 return (PyObject*) u;
6130}
6131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006132PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133"S.find(sub [,start [,end]]) -> int\n\
6134\n\
6135Return the lowest index in S where substring sub is found,\n\
6136such that sub is contained within s[start,end]. Optional\n\
6137arguments start and end are interpreted as in slice notation.\n\
6138\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006139Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
6141static PyObject *
6142unicode_find(PyUnicodeObject *self, PyObject *args)
6143{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006144 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006145 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006146 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006147 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Guido van Rossumb8872e62000-05-09 14:14:27 +00006149 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6150 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006152 substring = PyUnicode_FromObject(substring);
6153 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return NULL;
6155
Thomas Wouters477c8d52006-05-27 19:21:47 +00006156 result = stringlib_find_slice(
6157 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6158 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6159 start, end
6160 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161
6162 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163
6164 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165}
6166
6167static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169{
6170 if (index < 0 || index >= self->length) {
6171 PyErr_SetString(PyExc_IndexError, "string index out of range");
6172 return NULL;
6173 }
6174
6175 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6176}
6177
6178static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006179unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006181 /* Since Unicode objects compare equal to their UTF-8 string
6182 counterparts, we hash the UTF-8 string. */
6183 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6184 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006187PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188"S.index(sub [,start [,end]]) -> int\n\
6189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006190Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
6192static PyObject *
6193unicode_index(PyUnicodeObject *self, PyObject *args)
6194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006196 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006198 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Guido van Rossumb8872e62000-05-09 14:14:27 +00006200 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6201 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006203 substring = PyUnicode_FromObject(substring);
6204 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 return NULL;
6206
Thomas Wouters477c8d52006-05-27 19:21:47 +00006207 result = stringlib_find_slice(
6208 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6209 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6210 start, end
6211 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
6213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006214
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 if (result < 0) {
6216 PyErr_SetString(PyExc_ValueError, "substring not found");
6217 return NULL;
6218 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006219
Martin v. Löwis18e16552006-02-15 17:27:45 +00006220 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006226Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006227at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
6229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006230unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
6232 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6233 register const Py_UNICODE *e;
6234 int cased;
6235
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 /* Shortcut for single character strings */
6237 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006238 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006240 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006241 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006242 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006243
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 e = p + PyUnicode_GET_SIZE(self);
6245 cased = 0;
6246 for (; p < e; p++) {
6247 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006250 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 else if (!cased && Py_UNICODE_ISLOWER(ch))
6252 cased = 1;
6253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006254 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255}
6256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006257PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006258"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006260Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006261at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
6263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006264unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265{
6266 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6267 register const Py_UNICODE *e;
6268 int cased;
6269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 /* Shortcut for single character strings */
6271 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006272 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006274 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006275 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006276 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 e = p + PyUnicode_GET_SIZE(self);
6279 cased = 0;
6280 for (; p < e; p++) {
6281 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006282
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006284 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 else if (!cased && Py_UNICODE_ISUPPER(ch))
6286 cased = 1;
6287 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006288 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289}
6290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006291PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006292"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006294Return True if S is a titlecased string and there is at least one\n\
6295character in S, i.e. upper- and titlecase characters may only\n\
6296follow uncased characters and lowercase characters only cased ones.\n\
6297Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
6299static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006300unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
6302 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6303 register const Py_UNICODE *e;
6304 int cased, previous_is_cased;
6305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 /* Shortcut for single character strings */
6307 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006308 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6309 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006311 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006312 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006313 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 e = p + PyUnicode_GET_SIZE(self);
6316 cased = 0;
6317 previous_is_cased = 0;
6318 for (; p < e; p++) {
6319 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6322 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006323 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 previous_is_cased = 1;
6325 cased = 1;
6326 }
6327 else if (Py_UNICODE_ISLOWER(ch)) {
6328 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006329 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 previous_is_cased = 1;
6331 cased = 1;
6332 }
6333 else
6334 previous_is_cased = 0;
6335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006336 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006342Return True if all characters in S are whitespace\n\
6343and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
6345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006346unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
6348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6349 register const Py_UNICODE *e;
6350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 /* Shortcut for single character strings */
6352 if (PyUnicode_GET_SIZE(self) == 1 &&
6353 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006354 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006356 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006357 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006359
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 e = p + PyUnicode_GET_SIZE(self);
6361 for (; p < e; p++) {
6362 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006363 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006365 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366}
6367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006368PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006369"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006371Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006372and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006373
6374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006375unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006376{
6377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6378 register const Py_UNICODE *e;
6379
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006380 /* Shortcut for single character strings */
6381 if (PyUnicode_GET_SIZE(self) == 1 &&
6382 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006383 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006384
6385 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006386 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006387 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006388
6389 e = p + PyUnicode_GET_SIZE(self);
6390 for (; p < e; p++) {
6391 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006393 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006395}
6396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006397PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006398"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006399\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006400Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006402
6403static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006404unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006405{
6406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6407 register const Py_UNICODE *e;
6408
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006409 /* Shortcut for single character strings */
6410 if (PyUnicode_GET_SIZE(self) == 1 &&
6411 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006413
6414 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006415 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006417
6418 e = p + PyUnicode_GET_SIZE(self);
6419 for (; p < e; p++) {
6420 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006421 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006429Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431
6432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006433unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
6435 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6436 register const Py_UNICODE *e;
6437
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 /* Shortcut for single character strings */
6439 if (PyUnicode_GET_SIZE(self) == 1 &&
6440 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006441 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006444 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006446
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 e = p + PyUnicode_GET_SIZE(self);
6448 for (; p < e; p++) {
6449 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006455PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006456"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006458Return True if all characters in S are digits\n\
6459and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
6461static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006462unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6465 register const Py_UNICODE *e;
6466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 /* Shortcut for single character strings */
6468 if (PyUnicode_GET_SIZE(self) == 1 &&
6469 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006470 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006472 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006473 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006474 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006475
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 e = p + PyUnicode_GET_SIZE(self);
6477 for (; p < e; p++) {
6478 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006481 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006485"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006487Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
6490static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006491unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
6493 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6494 register const Py_UNICODE *e;
6495
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 /* Shortcut for single character strings */
6497 if (PyUnicode_GET_SIZE(self) == 1 &&
6498 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006499 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006501 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006502 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006503 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006504
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 e = p + PyUnicode_GET_SIZE(self);
6506 for (; p < e; p++) {
6507 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006508 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006510 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511}
6512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514"S.join(sequence) -> unicode\n\
6515\n\
6516Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006517sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518
6519static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006520unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006522 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526unicode_length(PyUnicodeObject *self)
6527{
6528 return self->length;
6529}
6530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006531PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006532"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533\n\
6534Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006535done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
6537static PyObject *
6538unicode_ljust(PyUnicodeObject *self, PyObject *args)
6539{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006540 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006541 Py_UNICODE fillchar = ' ';
6542
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006543 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 return NULL;
6545
Tim Peters7a29bd52001-09-12 03:03:31 +00006546 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 Py_INCREF(self);
6548 return (PyObject*) self;
6549 }
6550
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006551 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006554PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555"S.lower() -> unicode\n\
6556\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006560unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 return fixup(self, fixlower);
6563}
6564
/* Strip modes used by the strip helpers; each value doubles as an index
   into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
6573
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006574/* externally visible for str.strip(unicode) */
6575PyObject *
6576_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6577{
6578 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006580 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6582 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006583
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6585
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006586 i = 0;
6587 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6589 i++;
6590 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006591 }
6592
6593 j = len;
6594 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 do {
6596 j--;
6597 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6598 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006599 }
6600
6601 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006602 Py_INCREF(self);
6603 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006604 }
6605 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006606 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006607}
6608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006611do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006613 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006615
6616 i = 0;
6617 if (striptype != RIGHTSTRIP) {
6618 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6619 i++;
6620 }
6621 }
6622
6623 j = len;
6624 if (striptype != LEFTSTRIP) {
6625 do {
6626 j--;
6627 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6628 j++;
6629 }
6630
6631 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6632 Py_INCREF(self);
6633 return (PyObject*)self;
6634 }
6635 else
6636 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006639
6640static PyObject *
6641do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6642{
6643 PyObject *sep = NULL;
6644
6645 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6646 return NULL;
6647
6648 if (sep != NULL && sep != Py_None) {
6649 if (PyUnicode_Check(sep))
6650 return _PyUnicode_XStrip(self, striptype, sep);
6651 else if (PyString_Check(sep)) {
6652 PyObject *res;
6653 sep = PyUnicode_FromObject(sep);
6654 if (sep==NULL)
6655 return NULL;
6656 res = _PyUnicode_XStrip(self, striptype, sep);
6657 Py_DECREF(sep);
6658 return res;
6659 }
6660 else {
6661 PyErr_Format(PyExc_TypeError,
6662 "%s arg must be None, unicode or str",
6663 STRIPNAME(striptype));
6664 return NULL;
6665 }
6666 }
6667
6668 return do_strip(self, striptype);
6669}
6670
6671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006672PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006673"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006674\n\
6675Return a copy of the string S with leading and trailing\n\
6676whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006677If chars is given and not None, remove characters in chars instead.\n\
6678If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006679
6680static PyObject *
6681unicode_strip(PyUnicodeObject *self, PyObject *args)
6682{
6683 if (PyTuple_GET_SIZE(args) == 0)
6684 return do_strip(self, BOTHSTRIP); /* Common case */
6685 else
6686 return do_argstrip(self, BOTHSTRIP, args);
6687}
6688
6689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006690PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006691"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006692\n\
6693Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006694If chars is given and not None, remove characters in chars instead.\n\
6695If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006696
6697static PyObject *
6698unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6699{
6700 if (PyTuple_GET_SIZE(args) == 0)
6701 return do_strip(self, LEFTSTRIP); /* Common case */
6702 else
6703 return do_argstrip(self, LEFTSTRIP, args);
6704}
6705
6706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006707PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006708"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006709\n\
6710Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006711If chars is given and not None, remove characters in chars instead.\n\
6712If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006713
6714static PyObject *
6715unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6716{
6717 if (PyTuple_GET_SIZE(args) == 0)
6718 return do_strip(self, RIGHTSTRIP); /* Common case */
6719 else
6720 return do_argstrip(self, RIGHTSTRIP, args);
6721}
6722
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006725unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
6727 PyUnicodeObject *u;
6728 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006729 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006730 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
6732 if (len < 0)
6733 len = 0;
6734
Tim Peters7a29bd52001-09-12 03:03:31 +00006735 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 /* no repeat, return original string */
6737 Py_INCREF(str);
6738 return (PyObject*) str;
6739 }
Tim Peters8f422462000-09-09 06:13:41 +00006740
6741 /* ensure # of chars needed doesn't overflow int and # of bytes
6742 * needed doesn't overflow size_t
6743 */
6744 nchars = len * str->length;
6745 if (len && nchars / len != str->length) {
6746 PyErr_SetString(PyExc_OverflowError,
6747 "repeated string is too long");
6748 return NULL;
6749 }
6750 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6751 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6752 PyErr_SetString(PyExc_OverflowError,
6753 "repeated string is too long");
6754 return NULL;
6755 }
6756 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 if (!u)
6758 return NULL;
6759
6760 p = u->str;
6761
Thomas Wouters477c8d52006-05-27 19:21:47 +00006762 if (str->length == 1 && len > 0) {
6763 Py_UNICODE_FILL(p, str->str[0], len);
6764 } else {
6765 Py_ssize_t done = 0; /* number of characters copied this far */
6766 if (done < nchars) {
6767 Py_UNICODE_COPY(p, str->str, str->length);
6768 done = str->length;
6769 }
6770 while (done < nchars) {
6771 int n = (done <= nchars-done) ? done : nchars-done;
6772 Py_UNICODE_COPY(p+done, p, n);
6773 done += n;
6774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
6776
6777 return (PyObject*) u;
6778}
6779
6780PyObject *PyUnicode_Replace(PyObject *obj,
6781 PyObject *subobj,
6782 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006783 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
6785 PyObject *self;
6786 PyObject *str1;
6787 PyObject *str2;
6788 PyObject *result;
6789
6790 self = PyUnicode_FromObject(obj);
6791 if (self == NULL)
6792 return NULL;
6793 str1 = PyUnicode_FromObject(subobj);
6794 if (str1 == NULL) {
6795 Py_DECREF(self);
6796 return NULL;
6797 }
6798 str2 = PyUnicode_FromObject(replobj);
6799 if (str2 == NULL) {
6800 Py_DECREF(self);
6801 Py_DECREF(str1);
6802 return NULL;
6803 }
Tim Petersced69f82003-09-16 20:30:58 +00006804 result = replace((PyUnicodeObject *)self,
6805 (PyUnicodeObject *)str1,
6806 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 maxcount);
6808 Py_DECREF(self);
6809 Py_DECREF(str1);
6810 Py_DECREF(str2);
6811 return result;
6812}
6813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006814PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815"S.replace (old, new[, maxsplit]) -> unicode\n\
6816\n\
6817Return a copy of S with all occurrences of substring\n\
6818old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820
6821static PyObject*
6822unicode_replace(PyUnicodeObject *self, PyObject *args)
6823{
6824 PyUnicodeObject *str1;
6825 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006826 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 PyObject *result;
6828
Martin v. Löwis18e16552006-02-15 17:27:45 +00006829 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 return NULL;
6831 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6832 if (str1 == NULL)
6833 return NULL;
6834 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006835 if (str2 == NULL) {
6836 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
6840 result = replace(self, str1, str2, maxcount);
6841
6842 Py_DECREF(str1);
6843 Py_DECREF(str2);
6844 return result;
6845}
6846
6847static
6848PyObject *unicode_repr(PyObject *unicode)
6849{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006850 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006851 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006852 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6853 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6854
6855 /* XXX(nnorwitz): rather than over-allocating, it would be
6856 better to choose a different scheme. Perhaps scan the
6857 first N-chars of the string and allocate based on that size.
6858 */
6859 /* Initial allocation is based on the longest-possible unichr
6860 escape.
6861
6862 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6863 unichr, so in this case it's the longest unichr escape. In
6864 narrow (UTF-16) builds this is five chars per source unichr
6865 since there are two unichrs in the surrogate pair, so in narrow
6866 (UTF-16) builds it's not the longest unichr escape.
6867
6868 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6869 so in the narrow (UTF-16) build case it's the longest unichr
6870 escape.
6871 */
6872
Walter Dörwald1ab83302007-05-18 17:15:44 +00006873 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006874 2 /* quotes */
6875#ifdef Py_UNICODE_WIDE
6876 + 10*size
6877#else
6878 + 6*size
6879#endif
6880 + 1);
6881 if (repr == NULL)
6882 return NULL;
6883
Walter Dörwald1ab83302007-05-18 17:15:44 +00006884 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006885
6886 /* Add quote */
6887 *p++ = (findchar(s, size, '\'') &&
6888 !findchar(s, size, '"')) ? '"' : '\'';
6889 while (size-- > 0) {
6890 Py_UNICODE ch = *s++;
6891
6892 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006893 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006894 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006895 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006896 continue;
6897 }
6898
6899#ifdef Py_UNICODE_WIDE
6900 /* Map 21-bit characters to '\U00xxxxxx' */
6901 else if (ch >= 0x10000) {
6902 *p++ = '\\';
6903 *p++ = 'U';
6904 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6905 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6906 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6907 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6908 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6909 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6910 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6911 *p++ = hexdigits[ch & 0x0000000F];
6912 continue;
6913 }
6914#else
6915 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6916 else if (ch >= 0xD800 && ch < 0xDC00) {
6917 Py_UNICODE ch2;
6918 Py_UCS4 ucs;
6919
6920 ch2 = *s++;
6921 size--;
6922 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6923 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6924 *p++ = '\\';
6925 *p++ = 'U';
6926 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6927 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6928 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6929 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6930 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6931 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6932 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6933 *p++ = hexdigits[ucs & 0x0000000F];
6934 continue;
6935 }
6936 /* Fall through: isolated surrogates are copied as-is */
6937 s--;
6938 size++;
6939 }
6940#endif
6941
6942 /* Map 16-bit characters to '\uxxxx' */
6943 if (ch >= 256) {
6944 *p++ = '\\';
6945 *p++ = 'u';
6946 *p++ = hexdigits[(ch >> 12) & 0x000F];
6947 *p++ = hexdigits[(ch >> 8) & 0x000F];
6948 *p++ = hexdigits[(ch >> 4) & 0x000F];
6949 *p++ = hexdigits[ch & 0x000F];
6950 }
6951
6952 /* Map special whitespace to '\t', \n', '\r' */
6953 else if (ch == '\t') {
6954 *p++ = '\\';
6955 *p++ = 't';
6956 }
6957 else if (ch == '\n') {
6958 *p++ = '\\';
6959 *p++ = 'n';
6960 }
6961 else if (ch == '\r') {
6962 *p++ = '\\';
6963 *p++ = 'r';
6964 }
6965
6966 /* Map non-printable US ASCII to '\xhh' */
6967 else if (ch < ' ' || ch >= 0x7F) {
6968 *p++ = '\\';
6969 *p++ = 'x';
6970 *p++ = hexdigits[(ch >> 4) & 0x000F];
6971 *p++ = hexdigits[ch & 0x000F];
6972 }
6973
6974 /* Copy everything else as-is */
6975 else
6976 *p++ = (char) ch;
6977 }
6978 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006979 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00006980
6981 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006982 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00006983 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984}
6985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006986PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987"S.rfind(sub [,start [,end]]) -> int\n\
6988\n\
6989Return the highest index in S where substring sub is found,\n\
6990such that sub is contained within s[start,end]. Optional\n\
6991arguments start and end are interpreted as in slice notation.\n\
6992\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006993Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject *
6996unicode_rfind(PyUnicodeObject *self, PyObject *args)
6997{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006998 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007000 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007001 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002
Guido van Rossumb8872e62000-05-09 14:14:27 +00007003 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7004 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 substring = PyUnicode_FromObject(substring);
7007 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 return NULL;
7009
Thomas Wouters477c8d52006-05-27 19:21:47 +00007010 result = stringlib_rfind_slice(
7011 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7012 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7013 start, end
7014 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017
7018 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019}
7020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022"S.rindex(sub [,start [,end]]) -> int\n\
7023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026static PyObject *
7027unicode_rindex(PyUnicodeObject *self, PyObject *args)
7028{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007029 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007030 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007031 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007032 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
Guido van Rossumb8872e62000-05-09 14:14:27 +00007034 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7035 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007037 substring = PyUnicode_FromObject(substring);
7038 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 return NULL;
7040
Thomas Wouters477c8d52006-05-27 19:21:47 +00007041 result = stringlib_rfind_slice(
7042 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7043 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7044 start, end
7045 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046
7047 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007048
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 if (result < 0) {
7050 PyErr_SetString(PyExc_ValueError, "substring not found");
7051 return NULL;
7052 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054}
7055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007056PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007057"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058\n\
7059Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007060done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
7062static PyObject *
7063unicode_rjust(PyUnicodeObject *self, PyObject *args)
7064{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007065 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007066 Py_UNICODE fillchar = ' ';
7067
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007068 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 return NULL;
7070
Tim Peters7a29bd52001-09-12 03:03:31 +00007071 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 Py_INCREF(self);
7073 return (PyObject*) self;
7074 }
7075
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007076 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007080unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
7082 /* standard clamping */
7083 if (start < 0)
7084 start = 0;
7085 if (end < 0)
7086 end = 0;
7087 if (end > self->length)
7088 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007089 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 /* full slice, return original string */
7091 Py_INCREF(self);
7092 return (PyObject*) self;
7093 }
7094 if (start > end)
7095 start = end;
7096 /* copy slice */
7097 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7098 end - start);
7099}
7100
7101PyObject *PyUnicode_Split(PyObject *s,
7102 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007103 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104{
7105 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 s = PyUnicode_FromObject(s);
7108 if (s == NULL)
7109 return NULL;
7110 if (sep != NULL) {
7111 sep = PyUnicode_FromObject(sep);
7112 if (sep == NULL) {
7113 Py_DECREF(s);
7114 return NULL;
7115 }
7116 }
7117
7118 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7119
7120 Py_DECREF(s);
7121 Py_XDECREF(sep);
7122 return result;
7123}
7124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126"S.split([sep [,maxsplit]]) -> list of strings\n\
7127\n\
7128Return a list of the words in S, using sep as the\n\
7129delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007130splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007131any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject*
7134unicode_split(PyUnicodeObject *self, PyObject *args)
7135{
7136 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007137 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
7141
7142 if (substring == Py_None)
7143 return split(self, NULL, maxcount);
7144 else if (PyUnicode_Check(substring))
7145 return split(self, (PyUnicodeObject *)substring, maxcount);
7146 else
7147 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7148}
7149
Thomas Wouters477c8d52006-05-27 19:21:47 +00007150PyObject *
7151PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7152{
7153 PyObject* str_obj;
7154 PyObject* sep_obj;
7155 PyObject* out;
7156
7157 str_obj = PyUnicode_FromObject(str_in);
7158 if (!str_obj)
7159 return NULL;
7160 sep_obj = PyUnicode_FromObject(sep_in);
7161 if (!sep_obj) {
7162 Py_DECREF(str_obj);
7163 return NULL;
7164 }
7165
7166 out = stringlib_partition(
7167 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7168 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7169 );
7170
7171 Py_DECREF(sep_obj);
7172 Py_DECREF(str_obj);
7173
7174 return out;
7175}
7176
7177
7178PyObject *
7179PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7180{
7181 PyObject* str_obj;
7182 PyObject* sep_obj;
7183 PyObject* out;
7184
7185 str_obj = PyUnicode_FromObject(str_in);
7186 if (!str_obj)
7187 return NULL;
7188 sep_obj = PyUnicode_FromObject(sep_in);
7189 if (!sep_obj) {
7190 Py_DECREF(str_obj);
7191 return NULL;
7192 }
7193
7194 out = stringlib_rpartition(
7195 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7196 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7197 );
7198
7199 Py_DECREF(sep_obj);
7200 Py_DECREF(str_obj);
7201
7202 return out;
7203}
7204
7205PyDoc_STRVAR(partition__doc__,
7206"S.partition(sep) -> (head, sep, tail)\n\
7207\n\
7208Searches for the separator sep in S, and returns the part before it,\n\
7209the separator itself, and the part after it. If the separator is not\n\
7210found, returns S and two empty strings.");
7211
7212static PyObject*
7213unicode_partition(PyUnicodeObject *self, PyObject *separator)
7214{
7215 return PyUnicode_Partition((PyObject *)self, separator);
7216}
7217
7218PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007219"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007220\n\
7221Searches for the separator sep in S, starting at the end of S, and returns\n\
7222the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007223separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007224
7225static PyObject*
7226unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7227{
7228 return PyUnicode_RPartition((PyObject *)self, separator);
7229}
7230
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007231PyObject *PyUnicode_RSplit(PyObject *s,
7232 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007234{
7235 PyObject *result;
7236
7237 s = PyUnicode_FromObject(s);
7238 if (s == NULL)
7239 return NULL;
7240 if (sep != NULL) {
7241 sep = PyUnicode_FromObject(sep);
7242 if (sep == NULL) {
7243 Py_DECREF(s);
7244 return NULL;
7245 }
7246 }
7247
7248 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7249
7250 Py_DECREF(s);
7251 Py_XDECREF(sep);
7252 return result;
7253}
7254
7255PyDoc_STRVAR(rsplit__doc__,
7256"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7257\n\
7258Return a list of the words in S, using sep as the\n\
7259delimiter string, starting at the end of the string and\n\
7260working to the front. If maxsplit is given, at most maxsplit\n\
7261splits are done. If sep is not specified, any whitespace string\n\
7262is a separator.");
7263
7264static PyObject*
7265unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7266{
7267 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007269
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007271 return NULL;
7272
7273 if (substring == Py_None)
7274 return rsplit(self, NULL, maxcount);
7275 else if (PyUnicode_Check(substring))
7276 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7277 else
7278 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7279}
7280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007281PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007282"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283\n\
7284Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007285Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007286is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288static PyObject*
7289unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7290{
Guido van Rossum86662912000-04-11 15:38:46 +00007291 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
Guido van Rossum86662912000-04-11 15:38:46 +00007293 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 return NULL;
7295
Guido van Rossum86662912000-04-11 15:38:46 +00007296 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297}
7298
7299static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007300PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007302 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7303 Py_XINCREF(res);
7304 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305}
7306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007307PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308"S.swapcase() -> unicode\n\
7309\n\
7310Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007311and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007314unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 return fixup(self, fixswapcase);
7317}
7318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320"S.translate(table) -> unicode\n\
7321\n\
7322Return a copy of the string S, where all characters have been mapped\n\
7323through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007324Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7325Unmapped characters are left untouched. Characters mapped to None\n\
7326are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
7328static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
Tim Petersced69f82003-09-16 20:30:58 +00007331 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007333 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 "ignore");
7335}
7336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338"S.upper() -> unicode\n\
7339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007340Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
7342static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007343unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 return fixup(self, fixupper);
7346}
7347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349"S.zfill(width) -> unicode\n\
7350\n\
7351Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007352of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
7354static PyObject *
7355unicode_zfill(PyUnicodeObject *self, PyObject *args)
7356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 PyUnicodeObject *u;
7359
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360 Py_ssize_t width;
7361 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 return NULL;
7363
7364 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007365 if (PyUnicode_CheckExact(self)) {
7366 Py_INCREF(self);
7367 return (PyObject*) self;
7368 }
7369 else
7370 return PyUnicode_FromUnicode(
7371 PyUnicode_AS_UNICODE(self),
7372 PyUnicode_GET_SIZE(self)
7373 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 }
7375
7376 fill = width - self->length;
7377
7378 u = pad(self, fill, 0, '0');
7379
Walter Dörwald068325e2002-04-15 13:36:47 +00007380 if (u == NULL)
7381 return NULL;
7382
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 if (u->str[fill] == '+' || u->str[fill] == '-') {
7384 /* move sign to beginning of string */
7385 u->str[0] = u->str[fill];
7386 u->str[fill] = '0';
7387 }
7388
7389 return (PyObject*) u;
7390}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
#if 0
/* Disabled debugging helper: returns unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007400PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007401"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007403Return True if S starts with the specified prefix, False otherwise.\n\
7404With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405With optional end, stop comparing S at that position.\n\
7406prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408static PyObject *
7409unicode_startswith(PyUnicodeObject *self,
7410 PyObject *args)
7411{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007414 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007415 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007416 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421 if (PyTuple_Check(subobj)) {
7422 Py_ssize_t i;
7423 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7424 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7425 PyTuple_GET_ITEM(subobj, i));
7426 if (substring == NULL)
7427 return NULL;
7428 result = tailmatch(self, substring, start, end, -1);
7429 Py_DECREF(substring);
7430 if (result) {
7431 Py_RETURN_TRUE;
7432 }
7433 }
7434 /* nothing matched */
7435 Py_RETURN_FALSE;
7436 }
7437 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007439 return NULL;
7440 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443}
7444
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007447"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007449Return True if S ends with the specified suffix, False otherwise.\n\
7450With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451With optional end, stop comparing S at that position.\n\
7452suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
7454static PyObject *
7455unicode_endswith(PyUnicodeObject *self,
7456 PyObject *args)
7457{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007460 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007461 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7465 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467 if (PyTuple_Check(subobj)) {
7468 Py_ssize_t i;
7469 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7470 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7471 PyTuple_GET_ITEM(subobj, i));
7472 if (substring == NULL)
7473 return NULL;
7474 result = tailmatch(self, substring, start, end, +1);
7475 Py_DECREF(substring);
7476 if (result) {
7477 Py_RETURN_TRUE;
7478 }
7479 }
7480 Py_RETURN_FALSE;
7481 }
7482 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489}
7490
7491
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007492
7493static PyObject *
7494unicode_getnewargs(PyUnicodeObject *v)
7495{
7496 return Py_BuildValue("(u#)", v->str, v->length);
7497}
7498
7499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500static PyMethodDef unicode_methods[] = {
7501
7502 /* Order is according to common usage: often used methods should
7503 appear first, since lookup is done sequentially. */
7504
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007505 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7506 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7507 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007508 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007509 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7510 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7511 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7512 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7513 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7514 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7515 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007516 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007517 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7518 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7519 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007520 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007521 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007522/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7523 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7524 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7525 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007526 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007528 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007529 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007530 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7531 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7532 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7533 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7534 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7535 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7536 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7537 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7538 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7539 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7540 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7541 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7542 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7543 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007545#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007546 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547#endif
7548
7549#if 0
7550 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007551 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552#endif
7553
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007554 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 {NULL, NULL}
7556};
7557
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007558static PyObject *
7559unicode_mod(PyObject *v, PyObject *w)
7560{
7561 if (!PyUnicode_Check(v)) {
7562 Py_INCREF(Py_NotImplemented);
7563 return Py_NotImplemented;
7564 }
7565 return PyUnicode_Format(v, w);
7566}
7567
7568static PyNumberMethods unicode_as_number = {
7569 0, /*nb_add*/
7570 0, /*nb_subtract*/
7571 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007572 unicode_mod, /*nb_remainder*/
7573};
7574
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007576 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007577 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007578 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7579 (ssizeargfunc) unicode_getitem, /* sq_item */
7580 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 0, /* sq_ass_item */
7582 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007583 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584};
7585
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007586static PyObject*
7587unicode_subscript(PyUnicodeObject* self, PyObject* item)
7588{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007589 if (PyIndex_Check(item)) {
7590 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007591 if (i == -1 && PyErr_Occurred())
7592 return NULL;
7593 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007594 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007595 return unicode_getitem(self, i);
7596 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007598 Py_UNICODE* source_buf;
7599 Py_UNICODE* result_buf;
7600 PyObject* result;
7601
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007602 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007603 &start, &stop, &step, &slicelength) < 0) {
7604 return NULL;
7605 }
7606
7607 if (slicelength <= 0) {
7608 return PyUnicode_FromUnicode(NULL, 0);
7609 } else {
7610 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007611 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7612 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007613
7614 if (result_buf == NULL)
7615 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007616
7617 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7618 result_buf[i] = source_buf[cur];
7619 }
Tim Petersced69f82003-09-16 20:30:58 +00007620
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007621 result = PyUnicode_FromUnicode(result_buf, slicelength);
7622 PyMem_FREE(result_buf);
7623 return result;
7624 }
7625 } else {
7626 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7627 return NULL;
7628 }
7629}
7630
7631static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007632 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007633 (binaryfunc)unicode_subscript, /* mp_subscript */
7634 (objobjargproc)0, /* mp_ass_subscript */
7635};
7636
Martin v. Löwis18e16552006-02-15 17:27:45 +00007637static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007639 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 const void **ptr)
7641{
7642 if (index != 0) {
7643 PyErr_SetString(PyExc_SystemError,
7644 "accessing non-existent unicode segment");
7645 return -1;
7646 }
7647 *ptr = (void *) self->str;
7648 return PyUnicode_GET_DATA_SIZE(self);
7649}
7650
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651static Py_ssize_t
7652unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 const void **ptr)
7654{
7655 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007656 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 return -1;
7658}
7659
7660static int
7661unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007662 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663{
7664 if (lenp)
7665 *lenp = PyUnicode_GET_DATA_SIZE(self);
7666 return 1;
7667}
7668
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007669static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 const void **ptr)
7673{
7674 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007675
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 if (index != 0) {
7677 PyErr_SetString(PyExc_SystemError,
7678 "accessing non-existent unicode segment");
7679 return -1;
7680 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007681 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 if (str == NULL)
7683 return -1;
7684 *ptr = (void *) PyString_AS_STRING(str);
7685 return PyString_GET_SIZE(str);
7686}
7687
7688/* Helpers for PyUnicode_Format() */
7689
7690static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 if (argidx < arglen) {
7695 (*p_argidx)++;
7696 if (arglen < 0)
7697 return args;
7698 else
7699 return PyTuple_GetItem(args, argidx);
7700 }
7701 PyErr_SetString(PyExc_TypeError,
7702 "not enough arguments for format string");
7703 return NULL;
7704}
7705
/* Bit flags collected by PyUnicode_Format() from the flag characters
   that may follow '%' in a format specifier. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
7711
Martin v. Löwis18e16552006-02-15 17:27:45 +00007712static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007713strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 register Py_ssize_t i;
7716 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 for (i = len - 1; i >= 0; i--)
7718 buffer[i] = (Py_UNICODE) charbuffer[i];
7719
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 return len;
7721}
7722
Neal Norwitzfc76d632006-01-10 06:03:13 +00007723static int
7724doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7725{
Tim Peters15231542006-02-16 01:08:01 +00007726 Py_ssize_t result;
7727
Neal Norwitzfc76d632006-01-10 06:03:13 +00007728 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007729 result = strtounicode(buffer, (char *)buffer);
7730 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007731}
7732
7733static int
7734longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7735{
Tim Peters15231542006-02-16 01:08:01 +00007736 Py_ssize_t result;
7737
Neal Norwitzfc76d632006-01-10 06:03:13 +00007738 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007739 result = strtounicode(buffer, (char *)buffer);
7740 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007741}
7742
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
7746
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747static int
7748formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007749 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 int flags,
7751 int prec,
7752 int type,
7753 PyObject *v)
7754{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007755 /* fmt = '%#.' + `prec` + `type`
7756 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 char fmt[20];
7758 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007759
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 x = PyFloat_AsDouble(v);
7761 if (x == -1.0 && PyErr_Occurred())
7762 return -1;
7763 if (prec < 0)
7764 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7766 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007767 /* Worst case length calc to ensure no buffer overrun:
7768
7769 'g' formats:
7770 fmt = %#.<prec>g
7771 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7772 for any double rep.)
7773 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7774
7775 'f' formats:
7776 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7777 len = 1 + 50 + 1 + prec = 52 + prec
7778
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007779 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007780 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007781
7782 */
7783 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7784 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007785 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007786 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007787 return -1;
7788 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007789 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7790 (flags&F_ALT) ? "#" : "",
7791 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007792 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793}
7794
Tim Peters38fd5b62000-09-21 05:43:11 +00007795static PyObject*
7796formatlong(PyObject *val, int flags, int prec, int type)
7797{
7798 char *buf;
7799 int i, len;
7800 PyObject *str; /* temporary string object. */
7801 PyUnicodeObject *result;
7802
7803 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7804 if (!str)
7805 return NULL;
7806 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007807 if (!result) {
7808 Py_DECREF(str);
7809 return NULL;
7810 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007811 for (i = 0; i < len; i++)
7812 result->str[i] = buf[i];
7813 result->str[len] = 0;
7814 Py_DECREF(str);
7815 return (PyObject*)result;
7816}
7817
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818static int
7819formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007820 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 int flags,
7822 int prec,
7823 int type,
7824 PyObject *v)
7825{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007826 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007827 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7828 * + 1 + 1
7829 * = 24
7830 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007831 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007832 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 long x;
7834
7835 x = PyInt_AsLong(v);
7836 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007837 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007838 if (x < 0 && type == 'u') {
7839 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007840 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007841 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7842 sign = "-";
7843 else
7844 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007846 prec = 1;
7847
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007848 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7849 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007850 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007851 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007852 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007853 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007854 return -1;
7855 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007856
7857 if ((flags & F_ALT) &&
7858 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007859 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007860 * of issues that cause pain:
7861 * - when 0 is being converted, the C standard leaves off
7862 * the '0x' or '0X', which is inconsistent with other
7863 * %#x/%#X conversions and inconsistent with Python's
7864 * hex() function
7865 * - there are platforms that violate the standard and
7866 * convert 0 with the '0x' or '0X'
7867 * (Metrowerks, Compaq Tru64)
7868 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007869 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007870 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007871 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007872 * We can achieve the desired consistency by inserting our
7873 * own '0x' or '0X' prefix, and substituting %x/%X in place
7874 * of %#x/%#X.
7875 *
7876 * Note that this is the same approach as used in
7877 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007878 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007879 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7880 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007881 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007882 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007883 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7884 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007885 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007886 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007887 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007888 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007889 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007890 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891}
7892
7893static int
7894formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007895 size_t buflen,
7896 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007898 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007899 if (PyUnicode_Check(v)) {
7900 if (PyUnicode_GET_SIZE(v) != 1)
7901 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007905 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007906 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007907 goto onError;
7908 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
7911 else {
7912 /* Integer input truncated to a character */
7913 long x;
7914 x = PyInt_AsLong(v);
7915 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007916 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007917#ifdef Py_UNICODE_WIDE
7918 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007919 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007920 "%c arg not in range(0x110000) "
7921 "(wide Python build)");
7922 return -1;
7923 }
7924#else
7925 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007926 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007927 "%c arg not in range(0x10000) "
7928 "(narrow Python build)");
7929 return -1;
7930 }
7931#endif
7932 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 }
7934 buf[1] = '\0';
7935 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007936
7937 onError:
7938 PyErr_SetString(PyExc_TypeError,
7939 "%c requires int or char");
7940 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941}
7942
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007943/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7944
7945 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7946 chars are formatted. XXX This is a magic number. Each formatting
7947 routine does bounds checking to ensure no overflow, but a better
7948 solution may be to malloc a buffer of appropriate size for each
7949 format. For now, the current solution is sufficient.
7950*/
7951#define FORMATBUFLEN (size_t)120
7952
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953PyObject *PyUnicode_Format(PyObject *format,
7954 PyObject *args)
7955{
7956 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 int args_owned = 0;
7959 PyUnicodeObject *result = NULL;
7960 PyObject *dict = NULL;
7961 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007962
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 if (format == NULL || args == NULL) {
7964 PyErr_BadInternalCall();
7965 return NULL;
7966 }
7967 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007968 if (uformat == NULL)
7969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 fmt = PyUnicode_AS_UNICODE(uformat);
7971 fmtcnt = PyUnicode_GET_SIZE(uformat);
7972
7973 reslen = rescnt = fmtcnt + 100;
7974 result = _PyUnicode_New(reslen);
7975 if (result == NULL)
7976 goto onError;
7977 res = PyUnicode_AS_UNICODE(result);
7978
7979 if (PyTuple_Check(args)) {
7980 arglen = PyTuple_Size(args);
7981 argidx = 0;
7982 }
7983 else {
7984 arglen = -1;
7985 argidx = -2;
7986 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007987 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7988 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 dict = args;
7990
7991 while (--fmtcnt >= 0) {
7992 if (*fmt != '%') {
7993 if (--rescnt < 0) {
7994 rescnt = fmtcnt + 100;
7995 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007996 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007997 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7999 --rescnt;
8000 }
8001 *res++ = *fmt++;
8002 }
8003 else {
8004 /* Got a format specifier */
8005 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008006 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 Py_UNICODE c = '\0';
8009 Py_UNICODE fill;
8010 PyObject *v = NULL;
8011 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008012 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008014 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008015 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
8017 fmt++;
8018 if (*fmt == '(') {
8019 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 PyObject *key;
8022 int pcount = 1;
8023
8024 if (dict == NULL) {
8025 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008026 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
8028 }
8029 ++fmt;
8030 --fmtcnt;
8031 keystart = fmt;
8032 /* Skip over balanced parentheses */
8033 while (pcount > 0 && --fmtcnt >= 0) {
8034 if (*fmt == ')')
8035 --pcount;
8036 else if (*fmt == '(')
8037 ++pcount;
8038 fmt++;
8039 }
8040 keylen = fmt - keystart - 1;
8041 if (fmtcnt < 0 || pcount > 0) {
8042 PyErr_SetString(PyExc_ValueError,
8043 "incomplete format key");
8044 goto onError;
8045 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008046#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008047 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 then looked up since Python uses strings to hold
8049 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008050 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 key = PyUnicode_EncodeUTF8(keystart,
8052 keylen,
8053 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008054#else
8055 key = PyUnicode_FromUnicode(keystart, keylen);
8056#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 if (key == NULL)
8058 goto onError;
8059 if (args_owned) {
8060 Py_DECREF(args);
8061 args_owned = 0;
8062 }
8063 args = PyObject_GetItem(dict, key);
8064 Py_DECREF(key);
8065 if (args == NULL) {
8066 goto onError;
8067 }
8068 args_owned = 1;
8069 arglen = -1;
8070 argidx = -2;
8071 }
8072 while (--fmtcnt >= 0) {
8073 switch (c = *fmt++) {
8074 case '-': flags |= F_LJUST; continue;
8075 case '+': flags |= F_SIGN; continue;
8076 case ' ': flags |= F_BLANK; continue;
8077 case '#': flags |= F_ALT; continue;
8078 case '0': flags |= F_ZERO; continue;
8079 }
8080 break;
8081 }
8082 if (c == '*') {
8083 v = getnextarg(args, arglen, &argidx);
8084 if (v == NULL)
8085 goto onError;
8086 if (!PyInt_Check(v)) {
8087 PyErr_SetString(PyExc_TypeError,
8088 "* wants int");
8089 goto onError;
8090 }
8091 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008092 if (width == -1 && PyErr_Occurred())
8093 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 if (width < 0) {
8095 flags |= F_LJUST;
8096 width = -width;
8097 }
8098 if (--fmtcnt >= 0)
8099 c = *fmt++;
8100 }
8101 else if (c >= '0' && c <= '9') {
8102 width = c - '0';
8103 while (--fmtcnt >= 0) {
8104 c = *fmt++;
8105 if (c < '0' || c > '9')
8106 break;
8107 if ((width*10) / 10 != width) {
8108 PyErr_SetString(PyExc_ValueError,
8109 "width too big");
8110 goto onError;
8111 }
8112 width = width*10 + (c - '0');
8113 }
8114 }
8115 if (c == '.') {
8116 prec = 0;
8117 if (--fmtcnt >= 0)
8118 c = *fmt++;
8119 if (c == '*') {
8120 v = getnextarg(args, arglen, &argidx);
8121 if (v == NULL)
8122 goto onError;
8123 if (!PyInt_Check(v)) {
8124 PyErr_SetString(PyExc_TypeError,
8125 "* wants int");
8126 goto onError;
8127 }
8128 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008129 if (prec == -1 && PyErr_Occurred())
8130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 if (prec < 0)
8132 prec = 0;
8133 if (--fmtcnt >= 0)
8134 c = *fmt++;
8135 }
8136 else if (c >= '0' && c <= '9') {
8137 prec = c - '0';
8138 while (--fmtcnt >= 0) {
8139 c = Py_CHARMASK(*fmt++);
8140 if (c < '0' || c > '9')
8141 break;
8142 if ((prec*10) / 10 != prec) {
8143 PyErr_SetString(PyExc_ValueError,
8144 "prec too big");
8145 goto onError;
8146 }
8147 prec = prec*10 + (c - '0');
8148 }
8149 }
8150 } /* prec */
8151 if (fmtcnt >= 0) {
8152 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 if (--fmtcnt >= 0)
8154 c = *fmt++;
8155 }
8156 }
8157 if (fmtcnt < 0) {
8158 PyErr_SetString(PyExc_ValueError,
8159 "incomplete format");
8160 goto onError;
8161 }
8162 if (c != '%') {
8163 v = getnextarg(args, arglen, &argidx);
8164 if (v == NULL)
8165 goto onError;
8166 }
8167 sign = 0;
8168 fill = ' ';
8169 switch (c) {
8170
8171 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008172 pbuf = formatbuf;
8173 /* presume that buffer length is at least 1 */
8174 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 len = 1;
8176 break;
8177
8178 case 's':
8179 case 'r':
8180 if (PyUnicode_Check(v) && c == 's') {
8181 temp = v;
8182 Py_INCREF(temp);
8183 }
8184 else {
8185 PyObject *unicode;
8186 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008187 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 else
8189 temp = PyObject_Repr(v);
8190 if (temp == NULL)
8191 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008192 if (PyUnicode_Check(temp))
8193 /* nothing to do */;
8194 else if (PyString_Check(temp)) {
8195 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008196 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008198 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008200 Py_DECREF(temp);
8201 temp = unicode;
8202 if (temp == NULL)
8203 goto onError;
8204 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008205 else {
8206 Py_DECREF(temp);
8207 PyErr_SetString(PyExc_TypeError,
8208 "%s argument has non-string str()");
8209 goto onError;
8210 }
8211 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008212 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 len = PyUnicode_GET_SIZE(temp);
8214 if (prec >= 0 && len > prec)
8215 len = prec;
8216 break;
8217
8218 case 'i':
8219 case 'd':
8220 case 'u':
8221 case 'o':
8222 case 'x':
8223 case 'X':
8224 if (c == 'i')
8225 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008226 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008227 temp = formatlong(v, flags, prec, c);
8228 if (!temp)
8229 goto onError;
8230 pbuf = PyUnicode_AS_UNICODE(temp);
8231 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008232 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008234 else {
8235 pbuf = formatbuf;
8236 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8237 flags, prec, c, v);
8238 if (len < 0)
8239 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008240 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008241 }
8242 if (flags & F_ZERO)
8243 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 break;
8245
8246 case 'e':
8247 case 'E':
8248 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008249 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 case 'g':
8251 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008252 if (c == 'F')
8253 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008254 pbuf = formatbuf;
8255 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8256 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 if (len < 0)
8258 goto onError;
8259 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008260 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 fill = '0';
8262 break;
8263
8264 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008265 pbuf = formatbuf;
8266 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 if (len < 0)
8268 goto onError;
8269 break;
8270
8271 default:
8272 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008273 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008274 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008275 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008276 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008277 (Py_ssize_t)(fmt - 1 -
8278 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 goto onError;
8280 }
8281 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008282 if (*pbuf == '-' || *pbuf == '+') {
8283 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 len--;
8285 }
8286 else if (flags & F_SIGN)
8287 sign = '+';
8288 else if (flags & F_BLANK)
8289 sign = ' ';
8290 else
8291 sign = 0;
8292 }
8293 if (width < len)
8294 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008295 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 reslen -= rescnt;
8297 rescnt = width + fmtcnt + 100;
8298 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008299 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008300 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008301 PyErr_NoMemory();
8302 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008303 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008304 if (_PyUnicode_Resize(&result, reslen) < 0) {
8305 Py_XDECREF(temp);
8306 goto onError;
8307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 res = PyUnicode_AS_UNICODE(result)
8309 + reslen - rescnt;
8310 }
8311 if (sign) {
8312 if (fill != ' ')
8313 *res++ = sign;
8314 rescnt--;
8315 if (width > len)
8316 width--;
8317 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008318 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8319 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008320 assert(pbuf[1] == c);
8321 if (fill != ' ') {
8322 *res++ = *pbuf++;
8323 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008324 }
Tim Petersfff53252001-04-12 18:38:48 +00008325 rescnt -= 2;
8326 width -= 2;
8327 if (width < 0)
8328 width = 0;
8329 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 if (width > len && !(flags & F_LJUST)) {
8332 do {
8333 --rescnt;
8334 *res++ = fill;
8335 } while (--width > len);
8336 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008337 if (fill == ' ') {
8338 if (sign)
8339 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008340 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008341 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008342 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008343 *res++ = *pbuf++;
8344 *res++ = *pbuf++;
8345 }
8346 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008347 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 res += len;
8349 rescnt -= len;
8350 while (--width >= len) {
8351 --rescnt;
8352 *res++ = ' ';
8353 }
8354 if (dict && (argidx < arglen) && c != '%') {
8355 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008356 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008357 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 goto onError;
8359 }
8360 Py_XDECREF(temp);
8361 } /* '%' */
8362 } /* until end */
8363 if (argidx < arglen && !dict) {
8364 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008365 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 goto onError;
8367 }
8368
Thomas Woutersa96affe2006-03-12 00:29:36 +00008369 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8370 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 if (args_owned) {
8372 Py_DECREF(args);
8373 }
8374 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 return (PyObject *)result;
8376
8377 onError:
8378 Py_XDECREF(result);
8379 Py_DECREF(uformat);
8380 if (args_owned) {
8381 Py_DECREF(args);
8382 }
8383 return NULL;
8384}
8385
8386static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008387 (readbufferproc) unicode_buffer_getreadbuf,
8388 (writebufferproc) unicode_buffer_getwritebuf,
8389 (segcountproc) unicode_buffer_getsegcount,
8390 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391};
8392
Jeremy Hylton938ace62002-07-17 16:30:39 +00008393static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008394unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8395
Tim Peters6d6c1a32001-08-02 04:15:00 +00008396static PyObject *
8397unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8398{
8399 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008400 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008401 char *encoding = NULL;
8402 char *errors = NULL;
8403
Guido van Rossume023fe02001-08-30 03:12:59 +00008404 if (type != &PyUnicode_Type)
8405 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008406 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8407 kwlist, &x, &encoding, &errors))
8408 return NULL;
8409 if (x == NULL)
8410 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008411 if (encoding == NULL && errors == NULL)
8412 return PyObject_Unicode(x);
8413 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008414 return PyUnicode_FromEncodedObject(x, encoding, errors);
8415}
8416
Guido van Rossume023fe02001-08-30 03:12:59 +00008417static PyObject *
8418unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8419{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008420 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008422
8423 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8424 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8425 if (tmp == NULL)
8426 return NULL;
8427 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008428 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008429 if (pnew == NULL) {
8430 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008431 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008432 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008433 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8434 if (pnew->str == NULL) {
8435 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008436 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008437 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008438 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008439 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008440 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8441 pnew->length = n;
8442 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008443 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008444 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008445}
8446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008447PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008448"unicode(string [, encoding[, errors]]) -> object\n\
8449\n\
8450Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008451encoding defaults to the current default string encoding.\n\
8452errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008453
/* Forward declaration: unicode_iter() is defined further down in this file
   (after the iterator type) but is referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for Unicode strings.  It is registered under the type name
   "str" and uses PyBaseString_Type as its base.  Slots set to 0 are left
   unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8500
8501/* Initialize the Unicode implementation */
8502
Thomas Wouters78890102000-07-22 19:25:51 +00008503void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008505 int i;
8506
Thomas Wouters477c8d52006-05-27 19:21:47 +00008507 /* XXX - move this array to unicodectype.c ? */
8508 Py_UNICODE linebreak[] = {
8509 0x000A, /* LINE FEED */
8510 0x000D, /* CARRIAGE RETURN */
8511 0x001C, /* FILE SEPARATOR */
8512 0x001D, /* GROUP SEPARATOR */
8513 0x001E, /* RECORD SEPARATOR */
8514 0x0085, /* NEXT LINE */
8515 0x2028, /* LINE SEPARATOR */
8516 0x2029, /* PARAGRAPH SEPARATOR */
8517 };
8518
Fred Drakee4315f52000-05-09 19:53:39 +00008519 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008520 unicode_freelist = NULL;
8521 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008523 if (!unicode_empty)
8524 return;
8525
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008526 for (i = 0; i < 256; i++)
8527 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008528 if (PyType_Ready(&PyUnicode_Type) < 0)
8529 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008530
8531 /* initialize the linebreak bloom filter */
8532 bloom_linebreak = make_bloom_mask(
8533 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8534 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008535
8536 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537}
8538
8539/* Finalize the Unicode implementation */
8540
8541void
Thomas Wouters78890102000-07-22 19:25:51 +00008542_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008544 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008545 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008547 Py_XDECREF(unicode_empty);
8548 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008549
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008550 for (i = 0; i < 256; i++) {
8551 if (unicode_latin1[i]) {
8552 Py_DECREF(unicode_latin1[i]);
8553 unicode_latin1[i] = NULL;
8554 }
8555 }
8556
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008557 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 PyUnicodeObject *v = u;
8559 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008560 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008561 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008562 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008563 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008565 unicode_freelist = NULL;
8566 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008568
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008569
8570
/********************* Unicode Iterator **************************/

/* State of an iterator over a Unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to return */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
8578
8579static void
8580unicodeiter_dealloc(unicodeiterobject *it)
8581{
8582 _PyObject_GC_UNTRACK(it);
8583 Py_XDECREF(it->it_seq);
8584 PyObject_GC_Del(it);
8585}
8586
8587static int
8588unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8589{
8590 Py_VISIT(it->it_seq);
8591 return 0;
8592}
8593
8594static PyObject *
8595unicodeiter_next(unicodeiterobject *it)
8596{
8597 PyUnicodeObject *seq;
8598 PyObject *item;
8599
8600 assert(it != NULL);
8601 seq = it->it_seq;
8602 if (seq == NULL)
8603 return NULL;
8604 assert(PyUnicode_Check(seq));
8605
8606 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008607 item = PyUnicode_FromUnicode(
8608 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008609 if (item != NULL)
8610 ++it->it_index;
8611 return item;
8612 }
8613
8614 Py_DECREF(seq);
8615 it->it_seq = NULL;
8616 return NULL;
8617}
8618
8619static PyObject *
8620unicodeiter_len(unicodeiterobject *it)
8621{
8622 Py_ssize_t len = 0;
8623 if (it->it_seq)
8624 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8625 return PyInt_FromSsize_t(len);
8626}
8627
/* Docstring attached to the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for the Unicode iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL, NULL} /* sentinel */
};
8635
/* Type object for the iterator returned by unicode_iter().  Instances are
   GC-allocated (Py_TPFLAGS_HAVE_GC) and traversed via
   unicodeiter_traverse().  Slots after the final initializer are left
   unspecified here. */
PyTypeObject PyUnicodeIter_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicodeiterator",                  /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
8669
8670static PyObject *
8671unicode_iter(PyObject *seq)
8672{
8673 unicodeiterobject *it;
8674
8675 if (!PyUnicode_Check(seq)) {
8676 PyErr_BadInternalCall();
8677 return NULL;
8678 }
8679 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8680 if (it == NULL)
8681 return NULL;
8682 it->it_index = 0;
8683 Py_INCREF(seq);
8684 it->it_seq = (PyUnicodeObject *)seq;
8685 _PyObject_GC_TRACK(it);
8686 return (PyObject *)it;
8687}
8688
/* NOTE(review): presumably closes an extern "C" block opened earlier in
   the file under the same guard; the opening is outside this section. */
#ifdef __cplusplus
}
#endif
8692
8693
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/