blob: 9937705eef1da4e0ce35e3f2b6e21bfe63cae4d7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the 5 least
   significant bits of each unicode character as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
/* Type of a bloom mask: one bit for each possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Mask describing the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is an unsigned long so that bit 31 can be selected
   without overflowing a signed int, and `mask` is parenthesized so that an
   arbitrary expression (e.g. `a | b`) can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if `ch` is a line break; the bloom test is a cheap pre-filter in
   front of the Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True if `chr` is one of the `setlen` characters in `set`; `mask` is the
   bloom mask of that set and is tested first to skip the linear search
   where possible.  The whole expansion is parenthesized so the macro can
   be combined safely with other operators (e.g. `!BLOOM_MEMBER(...)`). */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
/* We allocate one more Py_UNICODE element (not byte) to make sure the
   string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal convenience wrapper (for use in unicodeobject.c only !) around
   PyUnicode_Resize() that accepts the address of any object pointer
   variable and casts it to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
Walter Dörwald99928352007-05-18 11:30:40 +0000430 unicode = _PyUnicode_New(size+1);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937/* --- UTF-7 Codec -------------------------------------------------------- */
938
939/* see RFC2152 for details */
940
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indicating whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
959
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: anything outside 1..127, anything the
   utf7_special table marks 1, and optionally whitespace (2) and
   RFC2152 Set O (3) characters. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF), and c may be any Py_UNICODE
   value, so the range is checked first; the unsigned cast also rejects
   negative values without a separate comparison. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit to its 6-bit value (only meaningful when
   B64CHAR(c) is true). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits for as many complete 6-bit groups as `bits` holds,
   taken from the top of `ch`; advances `out` and decrements `bits`. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* Emit every complete 16-bit unit accumulated in `ch`.  Relies on the
   enclosing function providing an `errmsg` variable and a `utf7Error`
   label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
/* The UTF-7 helper macros are local to the codec above; undefine them. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242/* --- UTF-8 Codec -------------------------------------------------------- */
1243
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
/* Unicode-Escape encoder support.

   PyUnicode_EncodeUnicodeEscape() below returns a Unicode-Escape string
   version of a Unicode buffer.  It takes no `quotes` argument and does not
   enclose the result in u"" or u'' quotes.

*/
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Walter Dörwald79e913e2007-05-12 11:08:06 +00002097static const char *hexdigits = "0123456789abcdef";
2098
2099PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Thomas Wouters89f507f2006-12-13 04:49:30 +00002105 /* XXX(nnorwitz): rather than over-allocating, it would be
2106 better to choose a different scheme. Perhaps scan the
2107 first N-chars of the string and allocate based on that size.
2108 */
2109 /* Initial allocation is based on the longest-possible unichr
2110 escape.
2111
2112 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2113 unichr, so in this case it's the longest unichr escape. In
2114 narrow (UTF-16) builds this is five chars per source unichr
2115 since there are two unichrs in the surrogate pair, so in narrow
2116 (UTF-16) builds it's not the longest unichr escape.
2117
2118 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2119 so in the narrow (UTF-16) build case it's the longest unichr
2120 escape.
2121 */
2122
Walter Dörwald79e913e2007-05-12 11:08:06 +00002123 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002124#ifdef Py_UNICODE_WIDE
2125 + 10*size
2126#else
2127 + 6*size
2128#endif
2129 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 if (repr == NULL)
2131 return NULL;
2132
Walter Dörwald79e913e2007-05-12 11:08:06 +00002133 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 while (size-- > 0) {
2136 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Walter Dörwald79e913e2007-05-12 11:08:06 +00002138 /* Escape backslashes */
2139 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 *p++ = '\\';
2141 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002142 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002145#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002146 /* Map 21-bit characters to '\U00xxxxxx' */
2147 else if (ch >= 0x10000) {
2148 *p++ = '\\';
2149 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002150 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2151 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2152 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2153 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2154 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2155 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2156 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2157 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002159 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002160#else
2161 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002162 else if (ch >= 0xD800 && ch < 0xDC00) {
2163 Py_UNICODE ch2;
2164 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002165
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002166 ch2 = *s++;
2167 size--;
2168 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2169 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2170 *p++ = '\\';
2171 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002172 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2173 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2174 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2175 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2176 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2177 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2178 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2179 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002180 continue;
2181 }
2182 /* Fall through: isolated surrogates are copied as-is */
2183 s--;
2184 size++;
2185 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002186#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002189 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 *p++ = '\\';
2191 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002192 *p++ = hexdigits[(ch >> 12) & 0x000F];
2193 *p++ = hexdigits[(ch >> 8) & 0x000F];
2194 *p++ = hexdigits[(ch >> 4) & 0x000F];
2195 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002197
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002198 /* Map special whitespace to '\t', \n', '\r' */
2199 else if (ch == '\t') {
2200 *p++ = '\\';
2201 *p++ = 't';
2202 }
2203 else if (ch == '\n') {
2204 *p++ = '\\';
2205 *p++ = 'n';
2206 }
2207 else if (ch == '\r') {
2208 *p++ = '\\';
2209 *p++ = 'r';
2210 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002211
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002212 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002213 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002215 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002216 *p++ = hexdigits[(ch >> 4) & 0x000F];
2217 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 /* Copy everything else as-is */
2221 else
2222 *p++ = (char) ch;
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002226 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2227 Py_DECREF(repr);
2228 return NULL;
2229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return repr;
2231}
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2234{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002235 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002240 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode));
2242
2243 if (!s)
2244 return NULL;
2245 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2246 PyBytes_GET_SIZE(s));
2247 Py_DECREF(s);
2248 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249}
2250
2251/* --- Raw Unicode Escape Codec ------------------------------------------- */
2252
2253PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002254 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 const char *errors)
2256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002258 Py_ssize_t startinpos;
2259 Py_ssize_t endinpos;
2260 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 const char *end;
2264 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 PyObject *errorHandler = NULL;
2266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002267
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 /* Escaped strings will always be longer than the resulting
2269 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 length after conversion to the true value. (But decoding error
2271 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 end = s + size;
2279 while (s < end) {
2280 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002281 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002283 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 /* Non-escape characters are interpreted as Unicode ordinals */
2286 if (*s != '\\') {
2287 *p++ = (unsigned char)*s++;
2288 continue;
2289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
2292 /* \u-escapes are only interpreted iff the number of leading
2293 backslashes if odd */
2294 bs = s;
2295 for (;s < end;) {
2296 if (*s != '\\')
2297 break;
2298 *p++ = (unsigned char)*s++;
2299 }
2300 if (((s - bs) & 1) == 0 ||
2301 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002302 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 continue;
2304 }
2305 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002306 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 s++;
2308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002314 endinpos = s-starts;
2315 if (unicode_decode_call_errorhandler(
2316 errors, &errorHandler,
2317 "rawunicodeescape", "truncated \\uXXXX",
2318 starts, size, &startinpos, &endinpos, &exc, &s,
2319 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002321 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
2323 x = (x<<4) & ~0xF;
2324 if (c >= '0' && c <= '9')
2325 x += c - '0';
2326 else if (c >= 'a' && c <= 'f')
2327 x += 10 + c - 'a';
2328 else
2329 x += 10 + c - 'A';
2330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002331#ifndef Py_UNICODE_WIDE
2332 if (x > 0x10000) {
2333 if (unicode_decode_call_errorhandler(
2334 errors, &errorHandler,
2335 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2336 starts, size, &startinpos, &endinpos, &exc, &s,
2337 (PyObject **)&v, &outpos, &p))
2338 goto onError;
2339 }
2340#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 *p++ = x;
2342 nextByte:
2343 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002345 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 onError:
2352 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 return NULL;
2356}
2357
2358PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360{
2361 PyObject *repr;
2362 char *p;
2363 char *q;
2364
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002365#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002366 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002367#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002368 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 if (repr == NULL)
2371 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002372 if (size == 0)
2373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
Walter Dörwald711005d2007-05-12 12:03:26 +00002375 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 while (size-- > 0) {
2377 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002378#ifdef Py_UNICODE_WIDE
2379 /* Map 32-bit characters to '\Uxxxxxxxx' */
2380 if (ch >= 0x10000) {
2381 *p++ = '\\';
2382 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002383 *p++ = hexdigits[(ch >> 28) & 0xf];
2384 *p++ = hexdigits[(ch >> 24) & 0xf];
2385 *p++ = hexdigits[(ch >> 20) & 0xf];
2386 *p++ = hexdigits[(ch >> 16) & 0xf];
2387 *p++ = hexdigits[(ch >> 12) & 0xf];
2388 *p++ = hexdigits[(ch >> 8) & 0xf];
2389 *p++ = hexdigits[(ch >> 4) & 0xf];
2390 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002391 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002392 else
2393#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 /* Map 16-bit characters to '\uxxxx' */
2395 if (ch >= 256) {
2396 *p++ = '\\';
2397 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002398 *p++ = hexdigits[(ch >> 12) & 0xf];
2399 *p++ = hexdigits[(ch >> 8) & 0xf];
2400 *p++ = hexdigits[(ch >> 4) & 0xf];
2401 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 }
2403 /* Copy everything else as-is */
2404 else
2405 *p++ = (char) ch;
2406 }
2407 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002408 if (PyBytes_Resize(repr, p - q)) {
2409 Py_DECREF(repr);
2410 return NULL;
2411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return repr;
2413}
2414
2415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2416{
Walter Dörwald711005d2007-05-12 12:03:26 +00002417 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002419 PyErr_BadArgument();
2420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002422 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2423 PyUnicode_GET_SIZE(unicode));
2424
2425 if (!s)
2426 return NULL;
2427 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2428 PyBytes_GET_SIZE(s));
2429 Py_DECREF(s);
2430 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431}
2432
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002433/* --- Unicode Internal Codec ------------------------------------------- */
2434
2435PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002437 const char *errors)
2438{
2439 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t startinpos;
2441 Py_ssize_t endinpos;
2442 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002443 PyUnicodeObject *v;
2444 Py_UNICODE *p;
2445 const char *end;
2446 const char *reason;
2447 PyObject *errorHandler = NULL;
2448 PyObject *exc = NULL;
2449
Neal Norwitzd43069c2006-01-08 01:12:10 +00002450#ifdef Py_UNICODE_WIDE
2451 Py_UNICODE unimax = PyUnicode_GetMax();
2452#endif
2453
Thomas Wouters89f507f2006-12-13 04:49:30 +00002454 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002455 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2456 if (v == NULL)
2457 goto onError;
2458 if (PyUnicode_GetSize((PyObject *)v) == 0)
2459 return (PyObject *)v;
2460 p = PyUnicode_AS_UNICODE(v);
2461 end = s + size;
2462
2463 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002464 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002465 /* We have to sanity check the raw data, otherwise doom looms for
2466 some malformed UCS-4 data. */
2467 if (
2468 #ifdef Py_UNICODE_WIDE
2469 *p > unimax || *p < 0 ||
2470 #endif
2471 end-s < Py_UNICODE_SIZE
2472 )
2473 {
2474 startinpos = s - starts;
2475 if (end-s < Py_UNICODE_SIZE) {
2476 endinpos = end-starts;
2477 reason = "truncated input";
2478 }
2479 else {
2480 endinpos = s - starts + Py_UNICODE_SIZE;
2481 reason = "illegal code point (> 0x10FFFF)";
2482 }
2483 outpos = p - PyUnicode_AS_UNICODE(v);
2484 if (unicode_decode_call_errorhandler(
2485 errors, &errorHandler,
2486 "unicode_internal", reason,
2487 starts, size, &startinpos, &endinpos, &exc, &s,
2488 (PyObject **)&v, &outpos, &p)) {
2489 goto onError;
2490 }
2491 }
2492 else {
2493 p++;
2494 s += Py_UNICODE_SIZE;
2495 }
2496 }
2497
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002498 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002499 goto onError;
2500 Py_XDECREF(errorHandler);
2501 Py_XDECREF(exc);
2502 return (PyObject *)v;
2503
2504 onError:
2505 Py_XDECREF(v);
2506 Py_XDECREF(errorHandler);
2507 Py_XDECREF(exc);
2508 return NULL;
2509}
2510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511/* --- Latin-1 Codec ------------------------------------------------------ */
2512
2513PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 const char *errors)
2516{
2517 PyUnicodeObject *v;
2518 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002519
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002521 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 Py_UNICODE r = *(unsigned char*)s;
2523 return PyUnicode_FromUnicode(&r, 1);
2524 }
2525
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 v = _PyUnicode_New(size);
2527 if (v == NULL)
2528 goto onError;
2529 if (size == 0)
2530 return (PyObject *)v;
2531 p = PyUnicode_AS_UNICODE(v);
2532 while (size-- > 0)
2533 *p++ = (unsigned char)*s++;
2534 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002535
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 onError:
2537 Py_XDECREF(v);
2538 return NULL;
2539}
2540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541/* create or adjust a UnicodeEncodeError */
2542static void make_encode_exception(PyObject **exceptionObject,
2543 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 const Py_UNICODE *unicode, Py_ssize_t size,
2545 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 if (*exceptionObject == NULL) {
2549 *exceptionObject = PyUnicodeEncodeError_Create(
2550 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
2552 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2554 goto onError;
2555 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2556 goto onError;
2557 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2558 goto onError;
2559 return;
2560 onError:
2561 Py_DECREF(*exceptionObject);
2562 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564}
2565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566/* raises a UnicodeEncodeError */
2567static void raise_encode_exception(PyObject **exceptionObject,
2568 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 const Py_UNICODE *unicode, Py_ssize_t size,
2570 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002571 const char *reason)
2572{
2573 make_encode_exception(exceptionObject,
2574 encoding, unicode, size, startpos, endpos, reason);
2575 if (*exceptionObject != NULL)
2576 PyCodec_StrictErrors(*exceptionObject);
2577}
2578
2579/* error handling callback helper:
2580 build arguments, call the callback and check the arguments,
2581 put the result into newpos and return the replacement string, which
2582 has to be freed by the caller */
2583static PyObject *unicode_encode_call_errorhandler(const char *errors,
2584 PyObject **errorHandler,
2585 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2587 Py_ssize_t startpos, Py_ssize_t endpos,
2588 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002590 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002591
2592 PyObject *restuple;
2593 PyObject *resunicode;
2594
2595 if (*errorHandler == NULL) {
2596 *errorHandler = PyCodec_LookupError(errors);
2597 if (*errorHandler == NULL)
2598 return NULL;
2599 }
2600
2601 make_encode_exception(exceptionObject,
2602 encoding, unicode, size, startpos, endpos, reason);
2603 if (*exceptionObject == NULL)
2604 return NULL;
2605
2606 restuple = PyObject_CallFunctionObjArgs(
2607 *errorHandler, *exceptionObject, NULL);
2608 if (restuple == NULL)
2609 return NULL;
2610 if (!PyTuple_Check(restuple)) {
2611 PyErr_Format(PyExc_TypeError, &argparse[4]);
2612 Py_DECREF(restuple);
2613 return NULL;
2614 }
2615 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2616 &resunicode, newpos)) {
2617 Py_DECREF(restuple);
2618 return NULL;
2619 }
2620 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002621 *newpos = size+*newpos;
2622 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002624 Py_DECREF(restuple);
2625 return NULL;
2626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_INCREF(resunicode);
2628 Py_DECREF(restuple);
2629 return resunicode;
2630}
2631
2632static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002633 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 const char *errors,
2635 int limit)
2636{
2637 /* output object */
2638 PyObject *res;
2639 /* pointers to the beginning and end+1 of input */
2640 const Py_UNICODE *startp = p;
2641 const Py_UNICODE *endp = p + size;
2642 /* pointer to the beginning of the unencodable characters */
2643 /* const Py_UNICODE *badp = NULL; */
2644 /* pointer into the output */
2645 char *str;
2646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002647 Py_ssize_t respos = 0;
2648 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2650 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 PyObject *errorHandler = NULL;
2652 PyObject *exc = NULL;
2653 /* the following variable is used for caching string comparisons
2654 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2655 int known_errorHandler = -1;
2656
2657 /* allocate enough for a simple encoding without
2658 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002659 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 if (res == NULL)
2661 goto onError;
2662 if (size == 0)
2663 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002664 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 ressize = size;
2666
2667 while (p<endp) {
2668 Py_UNICODE c = *p;
2669
2670 /* can we encode this? */
2671 if (c<limit) {
2672 /* no overflow check, because we know that the space is enough */
2673 *str++ = (char)c;
2674 ++p;
2675 }
2676 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002677 Py_ssize_t unicodepos = p-startp;
2678 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t repsize;
2681 Py_ssize_t newpos;
2682 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 Py_UNICODE *uni2;
2684 /* startpos for collecting unencodable chars */
2685 const Py_UNICODE *collstart = p;
2686 const Py_UNICODE *collend = p;
2687 /* find all unecodable characters */
2688 while ((collend < endp) && ((*collend)>=limit))
2689 ++collend;
2690 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2691 if (known_errorHandler==-1) {
2692 if ((errors==NULL) || (!strcmp(errors, "strict")))
2693 known_errorHandler = 1;
2694 else if (!strcmp(errors, "replace"))
2695 known_errorHandler = 2;
2696 else if (!strcmp(errors, "ignore"))
2697 known_errorHandler = 3;
2698 else if (!strcmp(errors, "xmlcharrefreplace"))
2699 known_errorHandler = 4;
2700 else
2701 known_errorHandler = 0;
2702 }
2703 switch (known_errorHandler) {
2704 case 1: /* strict */
2705 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2706 goto onError;
2707 case 2: /* replace */
2708 while (collstart++<collend)
2709 *str++ = '?'; /* fall through */
2710 case 3: /* ignore */
2711 p = collend;
2712 break;
2713 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002714 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 /* determine replacement size (temporarily (mis)uses p) */
2716 for (p = collstart, repsize = 0; p < collend; ++p) {
2717 if (*p<10)
2718 repsize += 2+1+1;
2719 else if (*p<100)
2720 repsize += 2+2+1;
2721 else if (*p<1000)
2722 repsize += 2+3+1;
2723 else if (*p<10000)
2724 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002725#ifndef Py_UNICODE_WIDE
2726 else
2727 repsize += 2+5+1;
2728#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 else if (*p<100000)
2730 repsize += 2+5+1;
2731 else if (*p<1000000)
2732 repsize += 2+6+1;
2733 else
2734 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002735#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 }
2737 requiredsize = respos+repsize+(endp-collend);
2738 if (requiredsize > ressize) {
2739 if (requiredsize<2*ressize)
2740 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002741 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002743 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 ressize = requiredsize;
2745 }
2746 /* generate replacement (temporarily (mis)uses p) */
2747 for (p = collstart; p < collend; ++p) {
2748 str += sprintf(str, "&#%d;", (int)*p);
2749 }
2750 p = collend;
2751 break;
2752 default:
2753 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2754 encoding, reason, startp, size, &exc,
2755 collstart-startp, collend-startp, &newpos);
2756 if (repunicode == NULL)
2757 goto onError;
2758 /* need more space? (at least enough for what we
2759 have+the replacement+the rest of the string, so
2760 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002761 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 repsize = PyUnicode_GET_SIZE(repunicode);
2763 requiredsize = respos+repsize+(endp-collend);
2764 if (requiredsize > ressize) {
2765 if (requiredsize<2*ressize)
2766 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002767 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 Py_DECREF(repunicode);
2769 goto onError;
2770 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002771 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ressize = requiredsize;
2773 }
2774 /* check if there is anything unencodable in the replacement
2775 and copy it to the output */
2776 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2777 c = *uni2;
2778 if (c >= limit) {
2779 raise_encode_exception(&exc, encoding, startp, size,
2780 unicodepos, unicodepos+1, reason);
2781 Py_DECREF(repunicode);
2782 goto onError;
2783 }
2784 *str = (char)c;
2785 }
2786 p = startp + newpos;
2787 Py_DECREF(repunicode);
2788 }
2789 }
2790 }
2791 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002792 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 if (respos<ressize)
2794 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002795 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
2798 return res;
2799
2800 onError:
2801 Py_XDECREF(res);
2802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
2804 return NULL;
2805}
2806
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 const char *errors)
2810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812}
2813
2814PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2815{
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2819 }
2820 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823}
2824
2825/* --- 7-bit ASCII Codec -------------------------------------------------- */
2826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002828 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 const char *errors)
2830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 PyUnicodeObject *v;
2833 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002834 Py_ssize_t startinpos;
2835 Py_ssize_t endinpos;
2836 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 const char *e;
2838 PyObject *errorHandler = NULL;
2839 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002840
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002842 if (size == 1 && *(unsigned char*)s < 128) {
2843 Py_UNICODE r = *(unsigned char*)s;
2844 return PyUnicode_FromUnicode(&r, 1);
2845 }
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 v = _PyUnicode_New(size);
2848 if (v == NULL)
2849 goto onError;
2850 if (size == 0)
2851 return (PyObject *)v;
2852 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 e = s + size;
2854 while (s < e) {
2855 register unsigned char c = (unsigned char)*s;
2856 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858 ++s;
2859 }
2860 else {
2861 startinpos = s-starts;
2862 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002863 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 if (unicode_decode_call_errorhandler(
2865 errors, &errorHandler,
2866 "ascii", "ordinal not in range(128)",
2867 starts, size, &startinpos, &endinpos, &exc, &s,
2868 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002872 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002873 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002874 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875 Py_XDECREF(errorHandler);
2876 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002878
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 onError:
2880 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 Py_XDECREF(errorHandler);
2882 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 return NULL;
2884}
2885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002887 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 const char *errors)
2889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891}
2892
2893PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2894{
2895 if (!PyUnicode_Check(unicode)) {
2896 PyErr_BadArgument();
2897 return NULL;
2898 }
2899 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2900 PyUnicode_GET_SIZE(unicode),
2901 NULL);
2902}
2903
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002904#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002905
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002906/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002907
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002908#if SIZEOF_INT < SIZEOF_SSIZE_T
2909#define NEED_RETRY
2910#endif
2911
2912/* XXX This code is limited to "true" double-byte encodings, as
2913 a) it assumes an incomplete character consists of a single byte, and
2914 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2915 encodings, see IsDBCSLeadByteEx documentation. */
2916
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 if IsDBCSLeadByte() rejects the byte.  Otherwise the character
   start reported by CharPrev() is examined, and 1 is returned when that
   position is the byte itself, when the previous byte is not a lead byte,
   or when the previous character is two bytes wide; 0 in all other
   cases. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
2927
2928/*
2929 * Decode MBCS string into unicode object. If 'final' is set, converts
2930 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2931 */
2932static int decode_mbcs(PyUnicodeObject **v,
2933 const char *s, /* MBCS string */
2934 int size, /* sizeof MBCS string */
2935 int final)
2936{
2937 Py_UNICODE *p;
2938 Py_ssize_t n = 0;
2939 int usize = 0;
2940
2941 assert(size >= 0);
2942
2943 /* Skip trailing lead-byte unless 'final' is set */
2944 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2945 --size;
2946
2947 /* First get the size of the result */
2948 if (size > 0) {
2949 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2950 if (usize == 0) {
2951 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2952 return -1;
2953 }
2954 }
2955
2956 if (*v == NULL) {
2957 /* Create unicode object */
2958 *v = _PyUnicode_New(usize);
2959 if (*v == NULL)
2960 return -1;
2961 }
2962 else {
2963 /* Extend unicode object */
2964 n = PyUnicode_GET_SIZE(*v);
2965 if (_PyUnicode_Resize(v, n + usize) < 0)
2966 return -1;
2967 }
2968
2969 /* Do the conversion */
2970 if (size > 0) {
2971 p = PyUnicode_AS_UNICODE(*v) + n;
2972 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2974 return -1;
2975 }
2976 }
2977
2978 return size;
2979}
2980
2981PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2982 Py_ssize_t size,
2983 const char *errors,
2984 Py_ssize_t *consumed)
2985{
2986 PyUnicodeObject *v = NULL;
2987 int done;
2988
2989 if (consumed)
2990 *consumed = 0;
2991
2992#ifdef NEED_RETRY
2993 retry:
2994 if (size > INT_MAX)
2995 done = decode_mbcs(&v, s, INT_MAX, 0);
2996 else
2997#endif
2998 done = decode_mbcs(&v, s, (int)size, !consumed);
2999
3000 if (done < 0) {
3001 Py_XDECREF(v);
3002 return NULL;
3003 }
3004
3005 if (consumed)
3006 *consumed += done;
3007
3008#ifdef NEED_RETRY
3009 if (size > INT_MAX) {
3010 s += done;
3011 size -= done;
3012 goto retry;
3013 }
3014#endif
3015
3016 return (PyObject *)v;
3017}
3018
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003020 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003021 const char *errors)
3022{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003023 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3024}
3025
3026/*
3027 * Convert unicode into string object (MBCS).
3028 * Returns 0 if succeed, -1 otherwise.
3029 */
3030static int encode_mbcs(PyObject **repr,
3031 const Py_UNICODE *p, /* unicode */
3032 int size) /* size of unicode */
3033{
3034 int mbcssize = 0;
3035 Py_ssize_t n = 0;
3036
3037 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003038
3039 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003040 if (size > 0) {
3041 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3042 if (mbcssize == 0) {
3043 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3044 return -1;
3045 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046 }
3047
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003048 if (*repr == NULL) {
3049 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003050 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003051 if (*repr == NULL)
3052 return -1;
3053 }
3054 else {
3055 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003056 n = PyBytes_Size(*repr);
3057 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003058 return -1;
3059 }
3060
3061 /* Do the conversion */
3062 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003063 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003064 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3065 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3066 return -1;
3067 }
3068 }
3069
3070 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003071}
3072
3073PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003074 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003075 const char *errors)
3076{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003077 PyObject *repr = NULL;
3078 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003079
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003080#ifdef NEED_RETRY
3081 retry:
3082 if (size > INT_MAX)
3083 ret = encode_mbcs(&repr, p, INT_MAX);
3084 else
3085#endif
3086 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003088 if (ret < 0) {
3089 Py_XDECREF(repr);
3090 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003091 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003092
3093#ifdef NEED_RETRY
3094 if (size > INT_MAX) {
3095 p += INT_MAX;
3096 size -= INT_MAX;
3097 goto retry;
3098 }
3099#endif
3100
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003101 return repr;
3102}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003103
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003104PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3105{
3106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
3108 return NULL;
3109 }
3110 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3111 PyUnicode_GET_SIZE(unicode),
3112 NULL);
3113}
3114
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003115#undef NEED_RETRY
3116
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003117#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119/* --- Character Mapping Codec -------------------------------------------- */
3120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 PyObject *mapping,
3124 const char *errors)
3125{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003127 Py_ssize_t startinpos;
3128 Py_ssize_t endinpos;
3129 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 PyUnicodeObject *v;
3132 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003136 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003137 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 /* Default to Latin-1 */
3140 if (mapping == NULL)
3141 return PyUnicode_DecodeLatin1(s, size, errors);
3142
3143 v = _PyUnicode_New(size);
3144 if (v == NULL)
3145 goto onError;
3146 if (size == 0)
3147 return (PyObject *)v;
3148 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003150 if (PyUnicode_CheckExact(mapping)) {
3151 mapstring = PyUnicode_AS_UNICODE(mapping);
3152 maplen = PyUnicode_GET_SIZE(mapping);
3153 while (s < e) {
3154 unsigned char ch = *s;
3155 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003157 if (ch < maplen)
3158 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003160 if (x == 0xfffe) {
3161 /* undefined mapping */
3162 outpos = p-PyUnicode_AS_UNICODE(v);
3163 startinpos = s-starts;
3164 endinpos = startinpos+1;
3165 if (unicode_decode_call_errorhandler(
3166 errors, &errorHandler,
3167 "charmap", "character maps to <undefined>",
3168 starts, size, &startinpos, &endinpos, &exc, &s,
3169 (PyObject **)&v, &outpos, &p)) {
3170 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003171 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003172 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003173 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003174 *p++ = x;
3175 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003177 }
3178 else {
3179 while (s < e) {
3180 unsigned char ch = *s;
3181 PyObject *w, *x;
3182
3183 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3184 w = PyInt_FromLong((long)ch);
3185 if (w == NULL)
3186 goto onError;
3187 x = PyObject_GetItem(mapping, w);
3188 Py_DECREF(w);
3189 if (x == NULL) {
3190 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3191 /* No mapping found means: mapping is undefined. */
3192 PyErr_Clear();
3193 x = Py_None;
3194 Py_INCREF(x);
3195 } else
3196 goto onError;
3197 }
3198
3199 /* Apply mapping */
3200 if (PyInt_Check(x)) {
3201 long value = PyInt_AS_LONG(x);
3202 if (value < 0 || value > 65535) {
3203 PyErr_SetString(PyExc_TypeError,
3204 "character mapping must be in range(65536)");
3205 Py_DECREF(x);
3206 goto onError;
3207 }
3208 *p++ = (Py_UNICODE)value;
3209 }
3210 else if (x == Py_None) {
3211 /* undefined mapping */
3212 outpos = p-PyUnicode_AS_UNICODE(v);
3213 startinpos = s-starts;
3214 endinpos = startinpos+1;
3215 if (unicode_decode_call_errorhandler(
3216 errors, &errorHandler,
3217 "charmap", "character maps to <undefined>",
3218 starts, size, &startinpos, &endinpos, &exc, &s,
3219 (PyObject **)&v, &outpos, &p)) {
3220 Py_DECREF(x);
3221 goto onError;
3222 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003223 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003224 continue;
3225 }
3226 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003227 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003228
3229 if (targetsize == 1)
3230 /* 1-1 mapping */
3231 *p++ = *PyUnicode_AS_UNICODE(x);
3232
3233 else if (targetsize > 1) {
3234 /* 1-n mapping */
3235 if (targetsize > extrachars) {
3236 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3238 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003239 (targetsize << 2);
3240 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003241 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003242 if (_PyUnicode_Resize(&v,
3243 PyUnicode_GET_SIZE(v) + needed) < 0) {
3244 Py_DECREF(x);
3245 goto onError;
3246 }
3247 p = PyUnicode_AS_UNICODE(v) + oldpos;
3248 }
3249 Py_UNICODE_COPY(p,
3250 PyUnicode_AS_UNICODE(x),
3251 targetsize);
3252 p += targetsize;
3253 extrachars -= targetsize;
3254 }
3255 /* 1-0 mapping: skip the character */
3256 }
3257 else {
3258 /* wrong return value */
3259 PyErr_SetString(PyExc_TypeError,
3260 "character mapping must return integer, None or unicode");
3261 Py_DECREF(x);
3262 goto onError;
3263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003265 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
3268 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003269 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003274
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 Py_XDECREF(errorHandler);
3277 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 Py_XDECREF(v);
3279 return NULL;
3280}
3281
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003282/* Charmap encoding: the lookup table */
3283
3284struct encoding_map{
3285 PyObject_HEAD
3286 unsigned char level1[32];
3287 int count2, count3;
3288 unsigned char level23[1];
3289};
3290
3291static PyObject*
3292encoding_map_size(PyObject *obj, PyObject* args)
3293{
3294 struct encoding_map *map = (struct encoding_map*)obj;
3295 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3296 128*map->count3);
3297}
3298
3299static PyMethodDef encoding_map_methods[] = {
3300 {"size", encoding_map_size, METH_NOARGS,
3301 PyDoc_STR("Return the size (in bytes) of this object") },
3302 { 0 }
3303};
3304
3305static void
3306encoding_map_dealloc(PyObject* o)
3307{
3308 PyObject_FREE(o);
3309}
3310
3311static PyTypeObject EncodingMapType = {
3312 PyObject_HEAD_INIT(NULL)
3313 0, /*ob_size*/
3314 "EncodingMap", /*tp_name*/
3315 sizeof(struct encoding_map), /*tp_basicsize*/
3316 0, /*tp_itemsize*/
3317 /* methods */
3318 encoding_map_dealloc, /*tp_dealloc*/
3319 0, /*tp_print*/
3320 0, /*tp_getattr*/
3321 0, /*tp_setattr*/
3322 0, /*tp_compare*/
3323 0, /*tp_repr*/
3324 0, /*tp_as_number*/
3325 0, /*tp_as_sequence*/
3326 0, /*tp_as_mapping*/
3327 0, /*tp_hash*/
3328 0, /*tp_call*/
3329 0, /*tp_str*/
3330 0, /*tp_getattro*/
3331 0, /*tp_setattro*/
3332 0, /*tp_as_buffer*/
3333 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3334 0, /*tp_doc*/
3335 0, /*tp_traverse*/
3336 0, /*tp_clear*/
3337 0, /*tp_richcompare*/
3338 0, /*tp_weaklistoffset*/
3339 0, /*tp_iter*/
3340 0, /*tp_iternext*/
3341 encoding_map_methods, /*tp_methods*/
3342 0, /*tp_members*/
3343 0, /*tp_getset*/
3344 0, /*tp_base*/
3345 0, /*tp_dict*/
3346 0, /*tp_descr_get*/
3347 0, /*tp_descr_set*/
3348 0, /*tp_dictoffset*/
3349 0, /*tp_init*/
3350 0, /*tp_alloc*/
3351 0, /*tp_new*/
3352 0, /*tp_free*/
3353 0, /*tp_is_gc*/
3354};
3355
3356PyObject*
3357PyUnicode_BuildEncodingMap(PyObject* string)
3358{
3359 Py_UNICODE *decode;
3360 PyObject *result;
3361 struct encoding_map *mresult;
3362 int i;
3363 int need_dict = 0;
3364 unsigned char level1[32];
3365 unsigned char level2[512];
3366 unsigned char *mlevel1, *mlevel2, *mlevel3;
3367 int count2 = 0, count3 = 0;
3368
3369 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3370 PyErr_BadArgument();
3371 return NULL;
3372 }
3373 decode = PyUnicode_AS_UNICODE(string);
3374 memset(level1, 0xFF, sizeof level1);
3375 memset(level2, 0xFF, sizeof level2);
3376
3377 /* If there isn't a one-to-one mapping of NULL to \0,
3378 or if there are non-BMP characters, we need to use
3379 a mapping dictionary. */
3380 if (decode[0] != 0)
3381 need_dict = 1;
3382 for (i = 1; i < 256; i++) {
3383 int l1, l2;
3384 if (decode[i] == 0
3385 #ifdef Py_UNICODE_WIDE
3386 || decode[i] > 0xFFFF
3387 #endif
3388 ) {
3389 need_dict = 1;
3390 break;
3391 }
3392 if (decode[i] == 0xFFFE)
3393 /* unmapped character */
3394 continue;
3395 l1 = decode[i] >> 11;
3396 l2 = decode[i] >> 7;
3397 if (level1[l1] == 0xFF)
3398 level1[l1] = count2++;
3399 if (level2[l2] == 0xFF)
3400 level2[l2] = count3++;
3401 }
3402
3403 if (count2 >= 0xFF || count3 >= 0xFF)
3404 need_dict = 1;
3405
3406 if (need_dict) {
3407 PyObject *result = PyDict_New();
3408 PyObject *key, *value;
3409 if (!result)
3410 return NULL;
3411 for (i = 0; i < 256; i++) {
3412 key = value = NULL;
3413 key = PyInt_FromLong(decode[i]);
3414 value = PyInt_FromLong(i);
3415 if (!key || !value)
3416 goto failed1;
3417 if (PyDict_SetItem(result, key, value) == -1)
3418 goto failed1;
3419 Py_DECREF(key);
3420 Py_DECREF(value);
3421 }
3422 return result;
3423 failed1:
3424 Py_XDECREF(key);
3425 Py_XDECREF(value);
3426 Py_DECREF(result);
3427 return NULL;
3428 }
3429
3430 /* Create a three-level trie */
3431 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3432 16*count2 + 128*count3 - 1);
3433 if (!result)
3434 return PyErr_NoMemory();
3435 PyObject_Init(result, &EncodingMapType);
3436 mresult = (struct encoding_map*)result;
3437 mresult->count2 = count2;
3438 mresult->count3 = count3;
3439 mlevel1 = mresult->level1;
3440 mlevel2 = mresult->level23;
3441 mlevel3 = mresult->level23 + 16*count2;
3442 memcpy(mlevel1, level1, 32);
3443 memset(mlevel2, 0xFF, 16*count2);
3444 memset(mlevel3, 0, 128*count3);
3445 count3 = 0;
3446 for (i = 1; i < 256; i++) {
3447 int o1, o2, o3, i2, i3;
3448 if (decode[i] == 0xFFFE)
3449 /* unmapped character */
3450 continue;
3451 o1 = decode[i]>>11;
3452 o2 = (decode[i]>>7) & 0xF;
3453 i2 = 16*mlevel1[o1] + o2;
3454 if (mlevel2[i2] == 0xFF)
3455 mlevel2[i2] = count3++;
3456 o3 = decode[i] & 0x7F;
3457 i3 = 128*mlevel2[i2] + o3;
3458 mlevel3[i3] = i;
3459 }
3460 return result;
3461}
3462
3463static int
3464encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3465{
3466 struct encoding_map *map = (struct encoding_map*)mapping;
3467 int l1 = c>>11;
3468 int l2 = (c>>7) & 0xF;
3469 int l3 = c & 0x7F;
3470 int i;
3471
3472#ifdef Py_UNICODE_WIDE
3473 if (c > 0xFFFF) {
3474 return -1;
3475 }
3476#endif
3477 if (c == 0)
3478 return 0;
3479 /* level 1*/
3480 i = map->level1[l1];
3481 if (i == 0xFF) {
3482 return -1;
3483 }
3484 /* level 2*/
3485 i = map->level23[16*i+l2];
3486 if (i == 0xFF) {
3487 return -1;
3488 }
3489 /* level 3 */
3490 i = map->level23[16*map->count2 + 128*i + l3];
3491 if (i == 0) {
3492 return -1;
3493 }
3494 return i;
3495}
3496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497/* Lookup the character ch in the mapping. If the character
3498 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003499 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *w = PyInt_FromLong((long)c);
3503 PyObject *x;
3504
3505 if (w == NULL)
3506 return NULL;
3507 x = PyObject_GetItem(mapping, w);
3508 Py_DECREF(w);
3509 if (x == NULL) {
3510 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3511 /* No mapping found means: mapping is undefined. */
3512 PyErr_Clear();
3513 x = Py_None;
3514 Py_INCREF(x);
3515 return x;
3516 } else
3517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003519 else if (x == Py_None)
3520 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 else if (PyInt_Check(x)) {
3522 long value = PyInt_AS_LONG(x);
3523 if (value < 0 || value > 255) {
3524 PyErr_SetString(PyExc_TypeError,
3525 "character mapping must be in range(256)");
3526 Py_DECREF(x);
3527 return NULL;
3528 }
3529 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 else if (PyString_Check(x))
3532 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003535 PyErr_Format(PyExc_TypeError,
3536 "character mapping must return integer, None or str8, not %.400s",
3537 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_DECREF(x);
3539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
3541}
3542
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003543static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003544charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003545{
Walter Dörwald827b0552007-05-12 13:23:53 +00003546 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003547 /* exponentially overallocate to minimize reallocations */
3548 if (requiredsize < 2*outsize)
3549 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003550 if (PyBytes_Resize(outobj, requiredsize)) {
3551 Py_DECREF(outobj);
3552 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003553 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003554 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003555}
3556
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003561 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 space is available. Return a new reference to the object that
3563 was put in the output buffer, or Py_None, if the mapping was undefined
3564 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003565 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003567charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003568 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003570 PyObject *rep;
3571 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003572 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003574 if (mapping->ob_type == &EncodingMapType) {
3575 int res = encoding_map_lookup(c, mapping);
3576 Py_ssize_t requiredsize = *outpos+1;
3577 if (res == -1)
3578 return enc_FAILED;
3579 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003580 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003581 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003582 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003583 outstart[(*outpos)++] = (char)res;
3584 return enc_SUCCESS;
3585 }
3586
3587 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003589 return enc_EXCEPTION;
3590 else if (rep==Py_None) {
3591 Py_DECREF(rep);
3592 return enc_FAILED;
3593 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003596 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003597 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003599 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003601 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3603 }
3604 else {
3605 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003606 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3607 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003608 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003609 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003611 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003613 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 memcpy(outstart + *outpos, repchars, repsize);
3615 *outpos += repsize;
3616 }
3617 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003618 Py_DECREF(rep);
3619 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620}
3621
3622/* handle an error in PyUnicode_EncodeCharmap
3623 Return 0 on success, -1 on error */
3624static
3625int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003628 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003629 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630{
3631 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003632 Py_ssize_t repsize;
3633 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 Py_UNICODE *uni2;
3635 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003636 Py_ssize_t collstartpos = *inpos;
3637 Py_ssize_t collendpos = *inpos+1;
3638 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 char *encoding = "charmap";
3640 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003641 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 /* find all unencodable characters */
3644 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003645 PyObject *rep;
3646 if (mapping->ob_type == &EncodingMapType) {
3647 int res = encoding_map_lookup(p[collendpos], mapping);
3648 if (res != -1)
3649 break;
3650 ++collendpos;
3651 continue;
3652 }
3653
3654 rep = charmapencode_lookup(p[collendpos], mapping);
3655 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003657 else if (rep!=Py_None) {
3658 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 break;
3660 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003661 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 ++collendpos;
3663 }
3664 /* cache callback name lookup
3665 * (if not done yet, i.e. it's the first error) */
3666 if (*known_errorHandler==-1) {
3667 if ((errors==NULL) || (!strcmp(errors, "strict")))
3668 *known_errorHandler = 1;
3669 else if (!strcmp(errors, "replace"))
3670 *known_errorHandler = 2;
3671 else if (!strcmp(errors, "ignore"))
3672 *known_errorHandler = 3;
3673 else if (!strcmp(errors, "xmlcharrefreplace"))
3674 *known_errorHandler = 4;
3675 else
3676 *known_errorHandler = 0;
3677 }
3678 switch (*known_errorHandler) {
3679 case 1: /* strict */
3680 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3681 return -1;
3682 case 2: /* replace */
3683 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3684 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003685 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 return -1;
3687 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003688 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3690 return -1;
3691 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 }
3693 /* fall through */
3694 case 3: /* ignore */
3695 *inpos = collendpos;
3696 break;
3697 case 4: /* xmlcharrefreplace */
3698 /* generate replacement (temporarily (mis)uses p) */
3699 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3700 char buffer[2+29+1+1];
3701 char *cp;
3702 sprintf(buffer, "&#%d;", (int)p[collpos]);
3703 for (cp = buffer; *cp; ++cp) {
3704 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003705 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003707 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3709 return -1;
3710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 }
3712 }
3713 *inpos = collendpos;
3714 break;
3715 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003716 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 encoding, reason, p, size, exceptionObject,
3718 collstartpos, collendpos, &newpos);
3719 if (repunicode == NULL)
3720 return -1;
3721 /* generate replacement */
3722 repsize = PyUnicode_GET_SIZE(repunicode);
3723 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3724 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003725 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 return -1;
3727 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003728 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3731 return -1;
3732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 }
3734 *inpos = newpos;
3735 Py_DECREF(repunicode);
3736 }
3737 return 0;
3738}
3739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003741 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 PyObject *mapping,
3743 const char *errors)
3744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 /* output object */
3746 PyObject *res = NULL;
3747 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003748 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003750 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 PyObject *errorHandler = NULL;
3752 PyObject *exc = NULL;
3753 /* the following variable is used for caching string comparisons
3754 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3755 * 3=ignore, 4=xmlcharrefreplace */
3756 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757
3758 /* Default to Latin-1 */
3759 if (mapping == NULL)
3760 return PyUnicode_EncodeLatin1(p, size, errors);
3761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 /* allocate enough for a simple encoding without
3763 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00003764 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 if (res == NULL)
3766 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003767 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 while (inpos<size) {
3771 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00003772 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003773 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003775 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 if (charmap_encoding_error(p, size, &inpos, mapping,
3777 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003778 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003779 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003780 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 else
3784 /* done with this character => adjust input position */
3785 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00003789 if (respos<PyBytes_GET_SIZE(res)) {
3790 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 goto onError;
3792 }
3793 Py_XDECREF(exc);
3794 Py_XDECREF(errorHandler);
3795 return res;
3796
3797 onError:
3798 Py_XDECREF(res);
3799 Py_XDECREF(exc);
3800 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 return NULL;
3802}
3803
3804PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3805 PyObject *mapping)
3806{
3807 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3808 PyErr_BadArgument();
3809 return NULL;
3810 }
3811 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3812 PyUnicode_GET_SIZE(unicode),
3813 mapping,
3814 NULL);
3815}
3816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817/* create or adjust a UnicodeTranslateError */
3818static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 const Py_UNICODE *unicode, Py_ssize_t size,
3820 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 if (*exceptionObject == NULL) {
3824 *exceptionObject = PyUnicodeTranslateError_Create(
3825 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
3827 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3829 goto onError;
3830 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3831 goto onError;
3832 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3833 goto onError;
3834 return;
3835 onError:
3836 Py_DECREF(*exceptionObject);
3837 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 }
3839}
3840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841/* raises a UnicodeTranslateError */
3842static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003843 const Py_UNICODE *unicode, Py_ssize_t size,
3844 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 const char *reason)
3846{
3847 make_translate_exception(exceptionObject,
3848 unicode, size, startpos, endpos, reason);
3849 if (*exceptionObject != NULL)
3850 PyCodec_StrictErrors(*exceptionObject);
3851}
3852
3853/* error handling callback helper:
3854 build arguments, call the callback and check the arguments,
3855 put the result into newpos and return the replacement string, which
3856 has to be freed by the caller */
3857static PyObject *unicode_translate_call_errorhandler(const char *errors,
3858 PyObject **errorHandler,
3859 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003860 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3861 Py_ssize_t startpos, Py_ssize_t endpos,
3862 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003864 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003866 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 PyObject *restuple;
3868 PyObject *resunicode;
3869
3870 if (*errorHandler == NULL) {
3871 *errorHandler = PyCodec_LookupError(errors);
3872 if (*errorHandler == NULL)
3873 return NULL;
3874 }
3875
3876 make_translate_exception(exceptionObject,
3877 unicode, size, startpos, endpos, reason);
3878 if (*exceptionObject == NULL)
3879 return NULL;
3880
3881 restuple = PyObject_CallFunctionObjArgs(
3882 *errorHandler, *exceptionObject, NULL);
3883 if (restuple == NULL)
3884 return NULL;
3885 if (!PyTuple_Check(restuple)) {
3886 PyErr_Format(PyExc_TypeError, &argparse[4]);
3887 Py_DECREF(restuple);
3888 return NULL;
3889 }
3890 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 Py_DECREF(restuple);
3893 return NULL;
3894 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 if (i_newpos<0)
3896 *newpos = size+i_newpos;
3897 else
3898 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003899 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003900 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003901 Py_DECREF(restuple);
3902 return NULL;
3903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 Py_INCREF(resunicode);
3905 Py_DECREF(restuple);
3906 return resunicode;
3907}
3908
3909/* Lookup the character ch in the mapping and put the result in result,
3910 which must be decrefed by the caller.
3911 Return 0 on success, -1 on error */
3912static
3913int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3914{
3915 PyObject *w = PyInt_FromLong((long)c);
3916 PyObject *x;
3917
3918 if (w == NULL)
3919 return -1;
3920 x = PyObject_GetItem(mapping, w);
3921 Py_DECREF(w);
3922 if (x == NULL) {
3923 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3924 /* No mapping found means: use 1:1 mapping. */
3925 PyErr_Clear();
3926 *result = NULL;
3927 return 0;
3928 } else
3929 return -1;
3930 }
3931 else if (x == Py_None) {
3932 *result = x;
3933 return 0;
3934 }
3935 else if (PyInt_Check(x)) {
3936 long value = PyInt_AS_LONG(x);
3937 long max = PyUnicode_GetMax();
3938 if (value < 0 || value > max) {
3939 PyErr_Format(PyExc_TypeError,
3940 "character mapping must be in range(0x%lx)", max+1);
3941 Py_DECREF(x);
3942 return -1;
3943 }
3944 *result = x;
3945 return 0;
3946 }
3947 else if (PyUnicode_Check(x)) {
3948 *result = x;
3949 return 0;
3950 }
3951 else {
3952 /* wrong return value */
3953 PyErr_SetString(PyExc_TypeError,
3954 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003955 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 return -1;
3957 }
3958}
3959/* ensure that *outobj is at least requiredsize characters long,
3960if not reallocate and adjust various state variables.
3961Return 0 on success, -1 on error */
3962static
Walter Dörwald4894c302003-10-24 14:25:28 +00003963int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003964 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003967 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003969 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003971 if (requiredsize < 2 * oldsize)
3972 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003973 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 return -1;
3975 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 }
3977 return 0;
3978}
3979/* lookup the character, put the result in the output string and adjust
3980 various state variables. Return a new reference to the object that
3981 was put in the output buffer in *result, or Py_None, if the mapping was
3982 undefined (in which case no character was written).
3983 The called must decref result.
3984 Return 0 on success, -1 on error. */
3985static
Walter Dörwald4894c302003-10-24 14:25:28 +00003986int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003988 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989{
Walter Dörwald4894c302003-10-24 14:25:28 +00003990 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 return -1;
3992 if (*res==NULL) {
3993 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003994 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 }
3996 else if (*res==Py_None)
3997 ;
3998 else if (PyInt_Check(*res)) {
3999 /* no overflow check, because we know that the space is enough */
4000 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4001 }
4002 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004003 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 if (repsize==1) {
4005 /* no overflow check, because we know that the space is enough */
4006 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4007 }
4008 else if (repsize!=0) {
4009 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004010 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004011 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004012 repsize - 1;
4013 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 return -1;
4015 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4016 *outp += repsize;
4017 }
4018 }
4019 else
4020 return -1;
4021 return 0;
4022}
4023
4024PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 PyObject *mapping,
4027 const char *errors)
4028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 /* output object */
4030 PyObject *res = NULL;
4031 /* pointers to the beginning and end+1 of input */
4032 const Py_UNICODE *startp = p;
4033 const Py_UNICODE *endp = p + size;
4034 /* pointer into the output */
4035 Py_UNICODE *str;
4036 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004037 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 char *reason = "character maps to <undefined>";
4039 PyObject *errorHandler = NULL;
4040 PyObject *exc = NULL;
4041 /* the following variable is used for caching string comparisons
4042 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4043 * 3=ignore, 4=xmlcharrefreplace */
4044 int known_errorHandler = -1;
4045
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 if (mapping == NULL) {
4047 PyErr_BadArgument();
4048 return NULL;
4049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050
4051 /* allocate enough for a simple 1:1 translation without
4052 replacements, if we need more, we'll resize */
4053 res = PyUnicode_FromUnicode(NULL, size);
4054 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004055 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 return res;
4058 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 while (p<endp) {
4061 /* try to encode it */
4062 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004063 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 goto onError;
4066 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004067 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 if (x!=Py_None) /* it worked => adjust input pointer */
4069 ++p;
4070 else { /* untranslatable character */
4071 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004072 Py_ssize_t repsize;
4073 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 Py_UNICODE *uni2;
4075 /* startpos for collecting untranslatable chars */
4076 const Py_UNICODE *collstart = p;
4077 const Py_UNICODE *collend = p+1;
4078 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 /* find all untranslatable characters */
4081 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004082 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 goto onError;
4084 Py_XDECREF(x);
4085 if (x!=Py_None)
4086 break;
4087 ++collend;
4088 }
4089 /* cache callback name lookup
4090 * (if not done yet, i.e. it's the first error) */
4091 if (known_errorHandler==-1) {
4092 if ((errors==NULL) || (!strcmp(errors, "strict")))
4093 known_errorHandler = 1;
4094 else if (!strcmp(errors, "replace"))
4095 known_errorHandler = 2;
4096 else if (!strcmp(errors, "ignore"))
4097 known_errorHandler = 3;
4098 else if (!strcmp(errors, "xmlcharrefreplace"))
4099 known_errorHandler = 4;
4100 else
4101 known_errorHandler = 0;
4102 }
4103 switch (known_errorHandler) {
4104 case 1: /* strict */
4105 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4106 goto onError;
4107 case 2: /* replace */
4108 /* No need to check for space, this is a 1:1 replacement */
4109 for (coll = collstart; coll<collend; ++coll)
4110 *str++ = '?';
4111 /* fall through */
4112 case 3: /* ignore */
4113 p = collend;
4114 break;
4115 case 4: /* xmlcharrefreplace */
4116 /* generate replacement (temporarily (mis)uses p) */
4117 for (p = collstart; p < collend; ++p) {
4118 char buffer[2+29+1+1];
4119 char *cp;
4120 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004121 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4123 goto onError;
4124 for (cp = buffer; *cp; ++cp)
4125 *str++ = *cp;
4126 }
4127 p = collend;
4128 break;
4129 default:
4130 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4131 reason, startp, size, &exc,
4132 collstart-startp, collend-startp, &newpos);
4133 if (repunicode == NULL)
4134 goto onError;
4135 /* generate replacement */
4136 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004137 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4139 Py_DECREF(repunicode);
4140 goto onError;
4141 }
4142 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4143 *str++ = *uni2;
4144 p = startp + newpos;
4145 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 }
4147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 /* Resize if we allocated to much */
4150 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004151 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004152 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004153 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 }
4155 Py_XDECREF(exc);
4156 Py_XDECREF(errorHandler);
4157 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 onError:
4160 Py_XDECREF(res);
4161 Py_XDECREF(exc);
4162 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 return NULL;
4164}
4165
4166PyObject *PyUnicode_Translate(PyObject *str,
4167 PyObject *mapping,
4168 const char *errors)
4169{
4170 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 str = PyUnicode_FromObject(str);
4173 if (str == NULL)
4174 goto onError;
4175 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4176 PyUnicode_GET_SIZE(str),
4177 mapping,
4178 errors);
4179 Py_DECREF(str);
4180 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004181
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 onError:
4183 Py_XDECREF(str);
4184 return NULL;
4185}
Tim Petersced69f82003-09-16 20:30:58 +00004186
Guido van Rossum9e896b32000-04-05 20:11:21 +00004187/* --- Decimal Encoder ---------------------------------------------------- */
4188
4189int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004190 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004191 char *output,
4192 const char *errors)
4193{
4194 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 PyObject *errorHandler = NULL;
4196 PyObject *exc = NULL;
4197 const char *encoding = "decimal";
4198 const char *reason = "invalid decimal Unicode string";
4199 /* the following variable is used for caching string comparisons
4200 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4201 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004202
4203 if (output == NULL) {
4204 PyErr_BadArgument();
4205 return -1;
4206 }
4207
4208 p = s;
4209 end = s + length;
4210 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004212 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004214 Py_ssize_t repsize;
4215 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 Py_UNICODE *uni2;
4217 Py_UNICODE *collstart;
4218 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004219
Guido van Rossum9e896b32000-04-05 20:11:21 +00004220 if (Py_UNICODE_ISSPACE(ch)) {
4221 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004223 continue;
4224 }
4225 decimal = Py_UNICODE_TODECIMAL(ch);
4226 if (decimal >= 0) {
4227 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004229 continue;
4230 }
Guido van Rossumba477042000-04-06 18:18:10 +00004231 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004232 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004234 continue;
4235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 /* All other characters are considered unencodable */
4237 collstart = p;
4238 collend = p+1;
4239 while (collend < end) {
4240 if ((0 < *collend && *collend < 256) ||
4241 !Py_UNICODE_ISSPACE(*collend) ||
4242 Py_UNICODE_TODECIMAL(*collend))
4243 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004244 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 /* cache callback name lookup
4246 * (if not done yet, i.e. it's the first error) */
4247 if (known_errorHandler==-1) {
4248 if ((errors==NULL) || (!strcmp(errors, "strict")))
4249 known_errorHandler = 1;
4250 else if (!strcmp(errors, "replace"))
4251 known_errorHandler = 2;
4252 else if (!strcmp(errors, "ignore"))
4253 known_errorHandler = 3;
4254 else if (!strcmp(errors, "xmlcharrefreplace"))
4255 known_errorHandler = 4;
4256 else
4257 known_errorHandler = 0;
4258 }
4259 switch (known_errorHandler) {
4260 case 1: /* strict */
4261 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4262 goto onError;
4263 case 2: /* replace */
4264 for (p = collstart; p < collend; ++p)
4265 *output++ = '?';
4266 /* fall through */
4267 case 3: /* ignore */
4268 p = collend;
4269 break;
4270 case 4: /* xmlcharrefreplace */
4271 /* generate replacement (temporarily (mis)uses p) */
4272 for (p = collstart; p < collend; ++p)
4273 output += sprintf(output, "&#%d;", (int)*p);
4274 p = collend;
4275 break;
4276 default:
4277 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4278 encoding, reason, s, length, &exc,
4279 collstart-s, collend-s, &newpos);
4280 if (repunicode == NULL)
4281 goto onError;
4282 /* generate replacement */
4283 repsize = PyUnicode_GET_SIZE(repunicode);
4284 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4285 Py_UNICODE ch = *uni2;
4286 if (Py_UNICODE_ISSPACE(ch))
4287 *output++ = ' ';
4288 else {
4289 decimal = Py_UNICODE_TODECIMAL(ch);
4290 if (decimal >= 0)
4291 *output++ = '0' + decimal;
4292 else if (0 < ch && ch < 256)
4293 *output++ = (char)ch;
4294 else {
4295 Py_DECREF(repunicode);
4296 raise_encode_exception(&exc, encoding,
4297 s, length, collstart-s, collend-s, reason);
4298 goto onError;
4299 }
4300 }
4301 }
4302 p = s + newpos;
4303 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004304 }
4305 }
4306 /* 0-terminate the output string */
4307 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 Py_XDECREF(exc);
4309 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004310 return 0;
4311
4312 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313 Py_XDECREF(exc);
4314 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004315 return -1;
4316}
4317
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318/* --- Helpers ------------------------------------------------------------ */
4319
Thomas Wouters477c8d52006-05-27 19:21:47 +00004320#define STRINGLIB_CHAR Py_UNICODE
4321
4322#define STRINGLIB_LEN PyUnicode_GET_SIZE
4323#define STRINGLIB_NEW PyUnicode_FromUnicode
4324#define STRINGLIB_STR PyUnicode_AS_UNICODE
4325
4326Py_LOCAL_INLINE(int)
4327STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004329 if (str[0] != other[0])
4330 return 1;
4331 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332}
4333
Thomas Wouters477c8d52006-05-27 19:21:47 +00004334#define STRINGLIB_EMPTY unicode_empty
4335
4336#include "stringlib/fastsearch.h"
4337
4338#include "stringlib/count.h"
4339#include "stringlib/find.h"
4340#include "stringlib/partition.h"
4341
/* Helper macro to fix up start/end slice values against (obj)->length.
   It reads and writes the variables `start` and `end` of the enclosing
   scope: a negative value is taken relative to the length and then
   clamped to 0, and `end` is capped at the length.
   Wrapped in do { } while (0) so that the whole fix-up behaves as a
   single statement, e.g. as the body of an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4354
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004356 PyObject *substr,
4357 Py_ssize_t start,
4358 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004360 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004361 PyUnicodeObject* str_obj;
4362 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004363
Thomas Wouters477c8d52006-05-27 19:21:47 +00004364 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4365 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004367 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4368 if (!sub_obj) {
4369 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 return -1;
4371 }
Tim Petersced69f82003-09-16 20:30:58 +00004372
Thomas Wouters477c8d52006-05-27 19:21:47 +00004373 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004374
Thomas Wouters477c8d52006-05-27 19:21:47 +00004375 result = stringlib_count(
4376 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4377 );
4378
4379 Py_DECREF(sub_obj);
4380 Py_DECREF(str_obj);
4381
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 return result;
4383}
4384
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004386 PyObject *sub,
4387 Py_ssize_t start,
4388 Py_ssize_t end,
4389 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004392
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004394 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004395 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004396 sub = PyUnicode_FromObject(sub);
4397 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004398 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004399 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 }
Tim Petersced69f82003-09-16 20:30:58 +00004401
Thomas Wouters477c8d52006-05-27 19:21:47 +00004402 if (direction > 0)
4403 result = stringlib_find_slice(
4404 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4405 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4406 start, end
4407 );
4408 else
4409 result = stringlib_rfind_slice(
4410 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4411 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4412 start, end
4413 );
4414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004416 Py_DECREF(sub);
4417
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 return result;
4419}
4420
Tim Petersced69f82003-09-16 20:30:58 +00004421static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422int tailmatch(PyUnicodeObject *self,
4423 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t start,
4425 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 int direction)
4427{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 if (substring->length == 0)
4429 return 1;
4430
Thomas Wouters477c8d52006-05-27 19:21:47 +00004431 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432
4433 end -= substring->length;
4434 if (end < start)
4435 return 0;
4436
4437 if (direction > 0) {
4438 if (Py_UNICODE_MATCH(self, end, substring))
4439 return 1;
4440 } else {
4441 if (Py_UNICODE_MATCH(self, start, substring))
4442 return 1;
4443 }
4444
4445 return 0;
4446}
4447
Martin v. Löwis18e16552006-02-15 17:27:45 +00004448Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t start,
4451 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 int direction)
4453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004455
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 str = PyUnicode_FromObject(str);
4457 if (str == NULL)
4458 return -1;
4459 substr = PyUnicode_FromObject(substr);
4460 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004461 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 return -1;
4463 }
Tim Petersced69f82003-09-16 20:30:58 +00004464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 result = tailmatch((PyUnicodeObject *)str,
4466 (PyUnicodeObject *)substr,
4467 start, end, direction);
4468 Py_DECREF(str);
4469 Py_DECREF(substr);
4470 return result;
4471}
4472
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473/* Apply fixfct filter to the Unicode object self and return a
4474 reference to the modified object */
4475
Tim Petersced69f82003-09-16 20:30:58 +00004476static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477PyObject *fixup(PyUnicodeObject *self,
4478 int (*fixfct)(PyUnicodeObject *s))
4479{
4480
4481 PyUnicodeObject *u;
4482
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004483 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 if (u == NULL)
4485 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004486
4487 Py_UNICODE_COPY(u->str, self->str, self->length);
4488
Tim Peters7a29bd52001-09-12 03:03:31 +00004489 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 /* fixfct should return TRUE if it modified the buffer. If
4491 FALSE, return a reference to the original buffer instead
4492 (to save space, not time) */
4493 Py_INCREF(self);
4494 Py_DECREF(u);
4495 return (PyObject*) self;
4496 }
4497 return (PyObject*) u;
4498}
4499
Tim Petersced69f82003-09-16 20:30:58 +00004500static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501int fixupper(PyUnicodeObject *self)
4502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 Py_UNICODE *s = self->str;
4505 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004506
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 while (len-- > 0) {
4508 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004509
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 ch = Py_UNICODE_TOUPPER(*s);
4511 if (ch != *s) {
4512 status = 1;
4513 *s = ch;
4514 }
4515 s++;
4516 }
4517
4518 return status;
4519}
4520
Tim Petersced69f82003-09-16 20:30:58 +00004521static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522int fixlower(PyUnicodeObject *self)
4523{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004524 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 Py_UNICODE *s = self->str;
4526 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004527
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 while (len-- > 0) {
4529 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004530
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 ch = Py_UNICODE_TOLOWER(*s);
4532 if (ch != *s) {
4533 status = 1;
4534 *s = ch;
4535 }
4536 s++;
4537 }
4538
4539 return status;
4540}
4541
Tim Petersced69f82003-09-16 20:30:58 +00004542static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543int fixswapcase(PyUnicodeObject *self)
4544{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004545 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 Py_UNICODE *s = self->str;
4547 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004548
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 while (len-- > 0) {
4550 if (Py_UNICODE_ISUPPER(*s)) {
4551 *s = Py_UNICODE_TOLOWER(*s);
4552 status = 1;
4553 } else if (Py_UNICODE_ISLOWER(*s)) {
4554 *s = Py_UNICODE_TOUPPER(*s);
4555 status = 1;
4556 }
4557 s++;
4558 }
4559
4560 return status;
4561}
4562
Tim Petersced69f82003-09-16 20:30:58 +00004563static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564int fixcapitalize(PyUnicodeObject *self)
4565{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004567 Py_UNICODE *s = self->str;
4568 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004569
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004570 if (len == 0)
4571 return 0;
4572 if (Py_UNICODE_ISLOWER(*s)) {
4573 *s = Py_UNICODE_TOUPPER(*s);
4574 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004576 s++;
4577 while (--len > 0) {
4578 if (Py_UNICODE_ISUPPER(*s)) {
4579 *s = Py_UNICODE_TOLOWER(*s);
4580 status = 1;
4581 }
4582 s++;
4583 }
4584 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585}
4586
4587static
4588int fixtitle(PyUnicodeObject *self)
4589{
4590 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4591 register Py_UNICODE *e;
4592 int previous_is_cased;
4593
4594 /* Shortcut for single character strings */
4595 if (PyUnicode_GET_SIZE(self) == 1) {
4596 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4597 if (*p != ch) {
4598 *p = ch;
4599 return 1;
4600 }
4601 else
4602 return 0;
4603 }
Tim Petersced69f82003-09-16 20:30:58 +00004604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 e = p + PyUnicode_GET_SIZE(self);
4606 previous_is_cased = 0;
4607 for (; p < e; p++) {
4608 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004609
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 if (previous_is_cased)
4611 *p = Py_UNICODE_TOLOWER(ch);
4612 else
4613 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004614
4615 if (Py_UNICODE_ISLOWER(ch) ||
4616 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617 Py_UNICODE_ISTITLE(ch))
4618 previous_is_cased = 1;
4619 else
4620 previous_is_cased = 0;
4621 }
4622 return 1;
4623}
4624
Tim Peters8ce9f162004-08-27 01:49:32 +00004625PyObject *
4626PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Tim Peters8ce9f162004-08-27 01:49:32 +00004628 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004629 const Py_UNICODE blank = ' ';
4630 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004631 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004633 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4634 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004635 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4636 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004638 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004639 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640
Tim Peters05eba1f2004-08-27 21:32:02 +00004641 fseq = PySequence_Fast(seq, "");
4642 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004643 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004644 }
4645
Tim Peters91879ab2004-08-27 22:35:44 +00004646 /* Grrrr. A codec may be invoked to convert str objects to
4647 * Unicode, and so it's possible to call back into Python code
4648 * during PyUnicode_FromObject(), and so it's possible for a sick
4649 * codec to change the size of fseq (if seq is a list). Therefore
4650 * we have to keep refetching the size -- can't assume seqlen
4651 * is invariant.
4652 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004653 seqlen = PySequence_Fast_GET_SIZE(fseq);
4654 /* If empty sequence, return u"". */
4655 if (seqlen == 0) {
4656 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4657 goto Done;
4658 }
4659 /* If singleton sequence with an exact Unicode, return that. */
4660 if (seqlen == 1) {
4661 item = PySequence_Fast_GET_ITEM(fseq, 0);
4662 if (PyUnicode_CheckExact(item)) {
4663 Py_INCREF(item);
4664 res = (PyUnicodeObject *)item;
4665 goto Done;
4666 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004667 }
4668
Tim Peters05eba1f2004-08-27 21:32:02 +00004669 /* At least two items to join, or one that isn't exact Unicode. */
4670 if (seqlen > 1) {
4671 /* Set up sep and seplen -- they're needed. */
4672 if (separator == NULL) {
4673 sep = &blank;
4674 seplen = 1;
4675 }
4676 else {
4677 internal_separator = PyUnicode_FromObject(separator);
4678 if (internal_separator == NULL)
4679 goto onError;
4680 sep = PyUnicode_AS_UNICODE(internal_separator);
4681 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004682 /* In case PyUnicode_FromObject() mutated seq. */
4683 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 }
4685 }
4686
4687 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004688 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004689 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004690 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004691 res_p = PyUnicode_AS_UNICODE(res);
4692 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004693
Tim Peters05eba1f2004-08-27 21:32:02 +00004694 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004695 Py_ssize_t itemlen;
4696 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004697
4698 item = PySequence_Fast_GET_ITEM(fseq, i);
4699 /* Convert item to Unicode. */
4700 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4701 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004702 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004703 " %.80s found",
4704 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004705 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004706 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004707 item = PyUnicode_FromObject(item);
4708 if (item == NULL)
4709 goto onError;
4710 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004711
Tim Peters91879ab2004-08-27 22:35:44 +00004712 /* In case PyUnicode_FromObject() mutated seq. */
4713 seqlen = PySequence_Fast_GET_SIZE(fseq);
4714
Tim Peters8ce9f162004-08-27 01:49:32 +00004715 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004717 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004718 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004719 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004720 if (i < seqlen - 1) {
4721 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004722 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004723 goto Overflow;
4724 }
4725 if (new_res_used > res_alloc) {
4726 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004727 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004728 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004729 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004730 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004731 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004732 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004733 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004735 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004736 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004738
4739 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004740 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004741 res_p += itemlen;
4742 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004743 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004744 res_p += seplen;
4745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004747 res_used = new_res_used;
4748 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004749
Tim Peters05eba1f2004-08-27 21:32:02 +00004750 /* Shrink res to match the used area; this probably can't fail,
4751 * but it's cheap to check.
4752 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004753 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004754 goto onError;
4755
4756 Done:
4757 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004758 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 return (PyObject *)res;
4760
Tim Peters8ce9f162004-08-27 01:49:32 +00004761 Overflow:
4762 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004763 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004764 Py_DECREF(item);
4765 /* fall through */
4766
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004768 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004769 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004770 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 return NULL;
4772}
4773
Tim Petersced69f82003-09-16 20:30:58 +00004774static
4775PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004776 Py_ssize_t left,
4777 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 Py_UNICODE fill)
4779{
4780 PyUnicodeObject *u;
4781
4782 if (left < 0)
4783 left = 0;
4784 if (right < 0)
4785 right = 0;
4786
Tim Peters7a29bd52001-09-12 03:03:31 +00004787 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 Py_INCREF(self);
4789 return self;
4790 }
4791
4792 u = _PyUnicode_New(left + self->length + right);
4793 if (u) {
4794 if (left)
4795 Py_UNICODE_FILL(u->str, fill, left);
4796 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4797 if (right)
4798 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4799 }
4800
4801 return u;
4802}
4803
/* Append the slice data[left:right] as a new Unicode object to `list`.

   Relies on the expansion site providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label, which is jumped to
   if creating or appending the piece fails.  Wrapped in do/while(0) so
   that the macro behaves like a single statement and must be followed
   by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4814
4815static
4816PyObject *split_whitespace(PyUnicodeObject *self,
4817 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 register Py_ssize_t i;
4821 register Py_ssize_t j;
4822 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 PyObject *str;
4824
4825 for (i = j = 0; i < len; ) {
4826 /* find a token */
4827 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4828 i++;
4829 j = i;
4830 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4831 i++;
4832 if (j < i) {
4833 if (maxcount-- <= 0)
4834 break;
4835 SPLIT_APPEND(self->str, j, i);
4836 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4837 i++;
4838 j = i;
4839 }
4840 }
4841 if (j < len) {
4842 SPLIT_APPEND(self->str, j, len);
4843 }
4844 return list;
4845
4846 onError:
4847 Py_DECREF(list);
4848 return NULL;
4849}
4850
4851PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004852 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 register Py_ssize_t i;
4855 register Py_ssize_t j;
4856 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 PyObject *list;
4858 PyObject *str;
4859 Py_UNICODE *data;
4860
4861 string = PyUnicode_FromObject(string);
4862 if (string == NULL)
4863 return NULL;
4864 data = PyUnicode_AS_UNICODE(string);
4865 len = PyUnicode_GET_SIZE(string);
4866
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 list = PyList_New(0);
4868 if (!list)
4869 goto onError;
4870
4871 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004873
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004875 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877
4878 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004879 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 if (i < len) {
4881 if (data[i] == '\r' && i + 1 < len &&
4882 data[i+1] == '\n')
4883 i += 2;
4884 else
4885 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004886 if (keepends)
4887 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 }
Guido van Rossum86662912000-04-11 15:38:46 +00004889 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 j = i;
4891 }
4892 if (j < len) {
4893 SPLIT_APPEND(data, j, len);
4894 }
4895
4896 Py_DECREF(string);
4897 return list;
4898
4899 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004900 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 Py_DECREF(string);
4902 return NULL;
4903}
4904
Tim Petersced69f82003-09-16 20:30:58 +00004905static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906PyObject *split_char(PyUnicodeObject *self,
4907 PyObject *list,
4908 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004909 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004911 register Py_ssize_t i;
4912 register Py_ssize_t j;
4913 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 PyObject *str;
4915
4916 for (i = j = 0; i < len; ) {
4917 if (self->str[i] == ch) {
4918 if (maxcount-- <= 0)
4919 break;
4920 SPLIT_APPEND(self->str, j, i);
4921 i = j = i + 1;
4922 } else
4923 i++;
4924 }
4925 if (j <= len) {
4926 SPLIT_APPEND(self->str, j, len);
4927 }
4928 return list;
4929
4930 onError:
4931 Py_DECREF(list);
4932 return NULL;
4933}
4934
Tim Petersced69f82003-09-16 20:30:58 +00004935static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936PyObject *split_substring(PyUnicodeObject *self,
4937 PyObject *list,
4938 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 register Py_ssize_t i;
4942 register Py_ssize_t j;
4943 Py_ssize_t len = self->length;
4944 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 PyObject *str;
4946
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004947 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 if (Py_UNICODE_MATCH(self, i, substring)) {
4949 if (maxcount-- <= 0)
4950 break;
4951 SPLIT_APPEND(self->str, j, i);
4952 i = j = i + sublen;
4953 } else
4954 i++;
4955 }
4956 if (j <= len) {
4957 SPLIT_APPEND(self->str, j, len);
4958 }
4959 return list;
4960
4961 onError:
4962 Py_DECREF(list);
4963 return NULL;
4964}
4965
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004966static
4967PyObject *rsplit_whitespace(PyUnicodeObject *self,
4968 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004969 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 register Py_ssize_t i;
4972 register Py_ssize_t j;
4973 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004974 PyObject *str;
4975
4976 for (i = j = len - 1; i >= 0; ) {
4977 /* find a token */
4978 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4979 i--;
4980 j = i;
4981 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4982 i--;
4983 if (j > i) {
4984 if (maxcount-- <= 0)
4985 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004986 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004987 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4988 i--;
4989 j = i;
4990 }
4991 }
4992 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004993 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004994 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004995 if (PyList_Reverse(list) < 0)
4996 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004997 return list;
4998
4999 onError:
5000 Py_DECREF(list);
5001 return NULL;
5002}
5003
5004static
5005PyObject *rsplit_char(PyUnicodeObject *self,
5006 PyObject *list,
5007 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005008 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005010 register Py_ssize_t i;
5011 register Py_ssize_t j;
5012 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005013 PyObject *str;
5014
5015 for (i = j = len - 1; i >= 0; ) {
5016 if (self->str[i] == ch) {
5017 if (maxcount-- <= 0)
5018 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005019 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005020 j = i = i - 1;
5021 } else
5022 i--;
5023 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005024 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005025 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005026 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005027 if (PyList_Reverse(list) < 0)
5028 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005029 return list;
5030
5031 onError:
5032 Py_DECREF(list);
5033 return NULL;
5034}
5035
5036static
5037PyObject *rsplit_substring(PyUnicodeObject *self,
5038 PyObject *list,
5039 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005040 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 register Py_ssize_t i;
5043 register Py_ssize_t j;
5044 Py_ssize_t len = self->length;
5045 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005046 PyObject *str;
5047
5048 for (i = len - sublen, j = len; i >= 0; ) {
5049 if (Py_UNICODE_MATCH(self, i, substring)) {
5050 if (maxcount-- <= 0)
5051 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005052 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053 j = i;
5054 i -= sublen;
5055 } else
5056 i--;
5057 }
5058 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005059 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005060 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005061 if (PyList_Reverse(list) < 0)
5062 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005063 return list;
5064
5065 onError:
5066 Py_DECREF(list);
5067 return NULL;
5068}
5069
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070#undef SPLIT_APPEND
5071
5072static
5073PyObject *split(PyUnicodeObject *self,
5074 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005075 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076{
5077 PyObject *list;
5078
5079 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005080 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
5082 list = PyList_New(0);
5083 if (!list)
5084 return NULL;
5085
5086 if (substring == NULL)
5087 return split_whitespace(self,list,maxcount);
5088
5089 else if (substring->length == 1)
5090 return split_char(self,list,substring->str[0],maxcount);
5091
5092 else if (substring->length == 0) {
5093 Py_DECREF(list);
5094 PyErr_SetString(PyExc_ValueError, "empty separator");
5095 return NULL;
5096 }
5097 else
5098 return split_substring(self,list,substring,maxcount);
5099}
5100
Tim Petersced69f82003-09-16 20:30:58 +00005101static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005102PyObject *rsplit(PyUnicodeObject *self,
5103 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005104 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005105{
5106 PyObject *list;
5107
5108 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005109 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005110
5111 list = PyList_New(0);
5112 if (!list)
5113 return NULL;
5114
5115 if (substring == NULL)
5116 return rsplit_whitespace(self,list,maxcount);
5117
5118 else if (substring->length == 1)
5119 return rsplit_char(self,list,substring->str[0],maxcount);
5120
5121 else if (substring->length == 0) {
5122 Py_DECREF(list);
5123 PyErr_SetString(PyExc_ValueError, "empty separator");
5124 return NULL;
5125 }
5126 else
5127 return rsplit_substring(self,list,substring,maxcount);
5128}
5129
5130static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131PyObject *replace(PyUnicodeObject *self,
5132 PyUnicodeObject *str1,
5133 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135{
5136 PyUnicodeObject *u;
5137
5138 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005139 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140
Thomas Wouters477c8d52006-05-27 19:21:47 +00005141 if (str1->length == str2->length) {
5142 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005143 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005144 if (str1->length == 1) {
5145 /* replace characters */
5146 Py_UNICODE u1, u2;
5147 if (!findchar(self->str, self->length, str1->str[0]))
5148 goto nothing;
5149 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5150 if (!u)
5151 return NULL;
5152 Py_UNICODE_COPY(u->str, self->str, self->length);
5153 u1 = str1->str[0];
5154 u2 = str2->str[0];
5155 for (i = 0; i < u->length; i++)
5156 if (u->str[i] == u1) {
5157 if (--maxcount < 0)
5158 break;
5159 u->str[i] = u2;
5160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005162 i = fastsearch(
5163 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005165 if (i < 0)
5166 goto nothing;
5167 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5168 if (!u)
5169 return NULL;
5170 Py_UNICODE_COPY(u->str, self->str, self->length);
5171 while (i <= self->length - str1->length)
5172 if (Py_UNICODE_MATCH(self, i, str1)) {
5173 if (--maxcount < 0)
5174 break;
5175 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5176 i += str1->length;
5177 } else
5178 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005181
5182 Py_ssize_t n, i, j, e;
5183 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_UNICODE *p;
5185
5186 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005187 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 if (n > maxcount)
5189 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005190 if (n == 0)
5191 goto nothing;
5192 /* new_size = self->length + n * (str2->length - str1->length)); */
5193 delta = (str2->length - str1->length);
5194 if (delta == 0) {
5195 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005197 product = n * (str2->length - str1->length);
5198 if ((product / (str2->length - str1->length)) != n) {
5199 PyErr_SetString(PyExc_OverflowError,
5200 "replace string is too long");
5201 return NULL;
5202 }
5203 new_size = self->length + product;
5204 if (new_size < 0) {
5205 PyErr_SetString(PyExc_OverflowError,
5206 "replace string is too long");
5207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 }
5209 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005210 u = _PyUnicode_New(new_size);
5211 if (!u)
5212 return NULL;
5213 i = 0;
5214 p = u->str;
5215 e = self->length - str1->length;
5216 if (str1->length > 0) {
5217 while (n-- > 0) {
5218 /* look for next match */
5219 j = i;
5220 while (j <= e) {
5221 if (Py_UNICODE_MATCH(self, j, str1))
5222 break;
5223 j++;
5224 }
5225 if (j > i) {
5226 if (j > e)
5227 break;
5228 /* copy unchanged part [i:j] */
5229 Py_UNICODE_COPY(p, self->str+i, j-i);
5230 p += j - i;
5231 }
5232 /* copy substitution string */
5233 if (str2->length > 0) {
5234 Py_UNICODE_COPY(p, str2->str, str2->length);
5235 p += str2->length;
5236 }
5237 i = j + str1->length;
5238 }
5239 if (i < self->length)
5240 /* copy tail [i:] */
5241 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5242 } else {
5243 /* interleave */
5244 while (n > 0) {
5245 Py_UNICODE_COPY(p, str2->str, str2->length);
5246 p += str2->length;
5247 if (--n <= 0)
5248 break;
5249 *p++ = self->str[i++];
5250 }
5251 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005255
5256nothing:
5257 /* nothing to replace; return original string (when possible) */
5258 if (PyUnicode_CheckExact(self)) {
5259 Py_INCREF(self);
5260 return (PyObject *) self;
5261 }
5262 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263}
5264
5265/* --- Unicode Object Methods --------------------------------------------- */
5266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005267PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268"S.title() -> unicode\n\
5269\n\
5270Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005271characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005274unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 return fixup(self, fixtitle);
5277}
5278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005279PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280"S.capitalize() -> unicode\n\
5281\n\
5282Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005283have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284
5285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005286unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 return fixup(self, fixcapitalize);
5289}
5290
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run each
   word through fixup() with fixcapitalize, and join the words again with
   the default separator of PyUnicode_Join().  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        /* swap the capitalized word into the slot of the original one */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5328
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005329/* Argument converter. Coerces to a single unicode character */
5330
5331static int
5332convert_uc(PyObject *obj, void *addr)
5333{
5334 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5335 PyObject *uniobj;
5336 Py_UNICODE *unistr;
5337
5338 uniobj = PyUnicode_FromObject(obj);
5339 if (uniobj == NULL) {
5340 PyErr_SetString(PyExc_TypeError,
5341 "The fill character cannot be converted to Unicode");
5342 return 0;
5343 }
5344 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5345 PyErr_SetString(PyExc_TypeError,
5346 "The fill character must be exactly one character long");
5347 Py_DECREF(uniobj);
5348 return 0;
5349 }
5350 unistr = PyUnicode_AS_UNICODE(uniobj);
5351 *fillcharloc = unistr[0];
5352 Py_DECREF(uniobj);
5353 return 1;
5354}
5355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005356PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005357"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005359Return S centered in a Unicode string of length width. Padding is\n\
5360done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361
5362static PyObject *
5363unicode_center(PyUnicodeObject *self, PyObject *args)
5364{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365 Py_ssize_t marg, left;
5366 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005367 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Thomas Woutersde017742006-02-16 19:34:37 +00005369 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 return NULL;
5371
Tim Peters7a29bd52001-09-12 03:03:31 +00005372 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 Py_INCREF(self);
5374 return (PyObject*) self;
5375 }
5376
5377 marg = width - self->length;
5378 left = marg / 2 + (marg & width & 1);
5379
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005380 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381}
5382
Marc-André Lemburge5034372000-08-08 08:04:29 +00005383#if 0
5384
5385/* This code should go into some future Unicode collation support
5386 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005387 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005388
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005389/* speedy UTF-16 code point order comparison */
5390/* gleaned from: */
5391/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5392
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005393static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005394{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005395 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005396 0, 0, 0, 0, 0, 0, 0, 0,
5397 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005398 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005399};
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401static int
5402unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 Py_UNICODE *s1 = str1->str;
5407 Py_UNICODE *s2 = str2->str;
5408
5409 len1 = str1->length;
5410 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005411
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005413 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005414
5415 c1 = *s1++;
5416 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005417
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005418 if (c1 > (1<<11) * 26)
5419 c1 += utf16Fixup[c1>>11];
5420 if (c2 > (1<<11) * 26)
5421 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005422 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005423
5424 if (c1 != c2)
5425 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005427 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 }
5429
5430 return (len1 < len2) ? -1 : (len1 != len2);
5431}
5432
Marc-André Lemburge5034372000-08-08 08:04:29 +00005433#else
5434
5435static int
5436unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5437{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005438 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005439
5440 Py_UNICODE *s1 = str1->str;
5441 Py_UNICODE *s2 = str2->str;
5442
5443 len1 = str1->length;
5444 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Marc-André Lemburge5034372000-08-08 08:04:29 +00005446 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005447 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005448
Fredrik Lundh45714e92001-06-26 16:39:36 +00005449 c1 = *s1++;
5450 c2 = *s2++;
5451
5452 if (c1 != c2)
5453 return (c1 < c2) ? -1 : 1;
5454
Marc-André Lemburge5034372000-08-08 08:04:29 +00005455 len1--; len2--;
5456 }
5457
5458 return (len1 < len2) ? -1 : (len1 != len2);
5459}
5460
5461#endif
5462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463int PyUnicode_Compare(PyObject *left,
5464 PyObject *right)
5465{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005466 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5467 return unicode_compare((PyUnicodeObject *)left,
5468 (PyUnicodeObject *)right);
5469 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5470 (PyUnicode_Check(left) && PyString_Check(right))) {
5471 if (PyUnicode_Check(left))
5472 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5473 if (PyUnicode_Check(right))
5474 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5475 assert(PyString_Check(left));
5476 assert(PyString_Check(right));
5477 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005479 PyErr_Format(PyExc_TypeError,
5480 "Can't compare %.100s and %.100s",
5481 left->ob_type->tp_name,
5482 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 return -1;
5484}
5485
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005486PyObject *PyUnicode_RichCompare(PyObject *left,
5487 PyObject *right,
5488 int op)
5489{
5490 int result;
5491
5492 result = PyUnicode_Compare(left, right);
5493 if (result == -1 && PyErr_Occurred())
5494 goto onError;
5495
5496 /* Convert the return value to a Boolean */
5497 switch (op) {
5498 case Py_EQ:
5499 result = (result == 0);
5500 break;
5501 case Py_NE:
5502 result = (result != 0);
5503 break;
5504 case Py_LE:
5505 result = (result <= 0);
5506 break;
5507 case Py_GE:
5508 result = (result >= 0);
5509 break;
5510 case Py_LT:
5511 result = (result == -1);
5512 break;
5513 case Py_GT:
5514 result = (result == 1);
5515 break;
5516 }
5517 return PyBool_FromLong(result);
5518
5519 onError:
5520
5521 /* Standard case
5522
5523 Type errors mean that PyUnicode_FromObject() could not convert
5524 one of the arguments (usually the right hand side) to Unicode,
5525 ie. we can't handle the comparison request. However, it is
5526 possible that the other object knows a comparison method, which
5527 is why we return Py_NotImplemented to give the other object a
5528 chance.
5529
5530 */
5531 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5532 PyErr_Clear();
5533 Py_INCREF(Py_NotImplemented);
5534 return Py_NotImplemented;
5535 }
5536 if (op != Py_EQ && op != Py_NE)
5537 return NULL;
5538
5539 /* Equality comparison.
5540
5541 This is a special case: we silence any PyExc_UnicodeDecodeError
5542 and instead turn it into a PyErr_UnicodeWarning.
5543
5544 */
5545 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5546 return NULL;
5547 PyErr_Clear();
5548 if (PyErr_Warn(PyExc_UnicodeWarning,
5549 (op == Py_EQ) ?
5550 "Unicode equal comparison "
5551 "failed to convert both arguments to Unicode - "
5552 "interpreting them as being unequal" :
5553 "Unicode unequal comparison "
5554 "failed to convert both arguments to Unicode - "
5555 "interpreting them as being unequal"
5556 ) < 0)
5557 return NULL;
5558 result = (op == Py_NE);
5559 return PyBool_FromLong(result);
5560}
5561
Guido van Rossum403d68b2000-03-13 15:55:09 +00005562int PyUnicode_Contains(PyObject *container,
5563 PyObject *element)
5564{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005565 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005566 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005567
5568 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005569 sub = PyUnicode_FromObject(element);
5570 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005571 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005572 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005573 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005574 }
5575
Thomas Wouters477c8d52006-05-27 19:21:47 +00005576 str = PyUnicode_FromObject(container);
5577 if (!str) {
5578 Py_DECREF(sub);
5579 return -1;
5580 }
5581
5582 result = stringlib_contains_obj(str, sub);
5583
5584 Py_DECREF(str);
5585 Py_DECREF(sub);
5586
Guido van Rossum403d68b2000-03-13 15:55:09 +00005587 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005588}
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590/* Concat to string or Unicode object giving a new Unicode object. */
5591
5592PyObject *PyUnicode_Concat(PyObject *left,
5593 PyObject *right)
5594{
5595 PyUnicodeObject *u = NULL, *v = NULL, *w;
5596
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005597 if (PyBytes_Check(left) || PyBytes_Check(right))
5598 return PyBytes_Concat(left, right);
5599
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 /* Coerce the two arguments */
5601 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5602 if (u == NULL)
5603 goto onError;
5604 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5605 if (v == NULL)
5606 goto onError;
5607
5608 /* Shortcuts */
5609 if (v == unicode_empty) {
5610 Py_DECREF(v);
5611 return (PyObject *)u;
5612 }
5613 if (u == unicode_empty) {
5614 Py_DECREF(u);
5615 return (PyObject *)v;
5616 }
5617
5618 /* Concat the two Unicode strings */
5619 w = _PyUnicode_New(u->length + v->length);
5620 if (w == NULL)
5621 goto onError;
5622 Py_UNICODE_COPY(w->str, u->str, u->length);
5623 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5624
5625 Py_DECREF(u);
5626 Py_DECREF(v);
5627 return (PyObject *)w;
5628
5629onError:
5630 Py_XDECREF(u);
5631 Py_XDECREF(v);
5632 return NULL;
5633}
5634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636"S.count(sub[, start[, end]]) -> int\n\
5637\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005638Return the number of non-overlapping occurrences of substring sub in\n\
5639Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005640interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
5642static PyObject *
5643unicode_count(PyUnicodeObject *self, PyObject *args)
5644{
5645 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005647 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 PyObject *result;
5649
Guido van Rossumb8872e62000-05-09 14:14:27 +00005650 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5651 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 return NULL;
5653
5654 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 if (substring == NULL)
5657 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005658
Thomas Wouters477c8d52006-05-27 19:21:47 +00005659 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
Thomas Wouters477c8d52006-05-27 19:21:47 +00005661 result = PyInt_FromSsize_t(
5662 stringlib_count(self->str + start, end - start,
5663 substring->str, substring->length)
5664 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
5666 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005667
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 return result;
5669}
5670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005671PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005672"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005674Encodes S using the codec registered for encoding. encoding defaults\n\
5675to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005676handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5678'xmlcharrefreplace' as well as any other name registered with\n\
5679codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
5681static PyObject *
5682unicode_encode(PyUnicodeObject *self, PyObject *args)
5683{
5684 char *encoding = NULL;
5685 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005686 PyObject *v;
5687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5689 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005690 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005691 if (v == NULL)
5692 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005693 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005694 if (PyString_Check(v)) {
5695 /* Old codec, turn it into bytes */
5696 PyObject *b = PyBytes_FromObject(v);
5697 Py_DECREF(v);
5698 return b;
5699 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005700 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005701 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005702 "(type=%.400s)",
5703 v->ob_type->tp_name);
5704 Py_DECREF(v);
5705 return NULL;
5706 }
5707 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005708
5709 onError:
5710 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005711}
5712
5713PyDoc_STRVAR(decode__doc__,
5714"S.decode([encoding[,errors]]) -> string or unicode\n\
5715\n\
5716Decodes S using the codec registered for encoding. encoding defaults\n\
5717to the default encoding. errors may be given to set a different error\n\
5718handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5719a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5720as well as any other name registerd with codecs.register_error that is\n\
5721able to handle UnicodeDecodeErrors.");
5722
5723static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005724unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005725{
5726 char *encoding = NULL;
5727 char *errors = NULL;
5728 PyObject *v;
5729
5730 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5731 return NULL;
5732 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005733 if (v == NULL)
5734 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005735 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5736 PyErr_Format(PyExc_TypeError,
5737 "decoder did not return a string/unicode object "
5738 "(type=%.400s)",
5739 v->ob_type->tp_name);
5740 Py_DECREF(v);
5741 return NULL;
5742 }
5743 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005744
5745 onError:
5746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747}
5748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005749PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750"S.expandtabs([tabsize]) -> unicode\n\
5751\n\
5752Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005753If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
5755static PyObject*
5756unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5757{
5758 Py_UNICODE *e;
5759 Py_UNICODE *p;
5760 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 PyUnicodeObject *u;
5763 int tabsize = 8;
5764
5765 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5766 return NULL;
5767
Thomas Wouters7e474022000-07-16 12:04:32 +00005768 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 i = j = 0;
5770 e = self->str + self->length;
5771 for (p = self->str; p < e; p++)
5772 if (*p == '\t') {
5773 if (tabsize > 0)
5774 j += tabsize - (j % tabsize);
5775 }
5776 else {
5777 j++;
5778 if (*p == '\n' || *p == '\r') {
5779 i += j;
5780 j = 0;
5781 }
5782 }
5783
5784 /* Second pass: create output string and fill it */
5785 u = _PyUnicode_New(i + j);
5786 if (!u)
5787 return NULL;
5788
5789 j = 0;
5790 q = u->str;
5791
5792 for (p = self->str; p < e; p++)
5793 if (*p == '\t') {
5794 if (tabsize > 0) {
5795 i = tabsize - (j % tabsize);
5796 j += i;
5797 while (i--)
5798 *q++ = ' ';
5799 }
5800 }
5801 else {
5802 j++;
5803 *q++ = *p;
5804 if (*p == '\n' || *p == '\r')
5805 j = 0;
5806 }
5807
5808 return (PyObject*) u;
5809}
5810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005811PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812"S.find(sub [,start [,end]]) -> int\n\
5813\n\
5814Return the lowest index in S where substring sub is found,\n\
5815such that sub is contained within s[start,end]. Optional\n\
5816arguments start and end are interpreted as in slice notation.\n\
5817\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005818Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820static PyObject *
5821unicode_find(PyUnicodeObject *self, PyObject *args)
5822{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005825 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005826 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
Guido van Rossumb8872e62000-05-09 14:14:27 +00005828 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5829 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005831 substring = PyUnicode_FromObject(substring);
5832 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 return NULL;
5834
Thomas Wouters477c8d52006-05-27 19:21:47 +00005835 result = stringlib_find_slice(
5836 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5837 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5838 start, end
5839 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
5841 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842
5843 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844}
5845
5846static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
5849 if (index < 0 || index >= self->length) {
5850 PyErr_SetString(PyExc_IndexError, "string index out of range");
5851 return NULL;
5852 }
5853
5854 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5855}
5856
5857static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005858unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005860 /* Since Unicode objects compare equal to their UTF-8 string
5861 counterparts, we hash the UTF-8 string. */
5862 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5863 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864}
5865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005866PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867"S.index(sub [,start [,end]]) -> int\n\
5868\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005869Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871static PyObject *
5872unicode_index(PyUnicodeObject *self, PyObject *args)
5873{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005874 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005875 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005876 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005877 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Guido van Rossumb8872e62000-05-09 14:14:27 +00005879 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5880 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005882 substring = PyUnicode_FromObject(substring);
5883 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886 result = stringlib_find_slice(
5887 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5888 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5889 start, end
5890 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 if (result < 0) {
5895 PyErr_SetString(PyExc_ValueError, "substring not found");
5896 return NULL;
5897 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005898
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900}
5901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005902PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005903"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005905Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005906at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
5908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005909unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
5911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5912 register const Py_UNICODE *e;
5913 int cased;
5914
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 /* Shortcut for single character strings */
5916 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005917 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005920 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 e = p + PyUnicode_GET_SIZE(self);
5924 cased = 0;
5925 for (; p < e; p++) {
5926 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005929 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 else if (!cased && Py_UNICODE_ISLOWER(ch))
5931 cased = 1;
5932 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005933 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934}
5935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005936PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005937"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005939Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005940at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941
5942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005943unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
5945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5946 register const Py_UNICODE *e;
5947 int cased;
5948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 /* Shortcut for single character strings */
5950 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005951 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005953 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005954 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005956
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 e = p + PyUnicode_GET_SIZE(self);
5958 cased = 0;
5959 for (; p < e; p++) {
5960 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005963 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 else if (!cased && Py_UNICODE_ISUPPER(ch))
5965 cased = 1;
5966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005967 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968}
5969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005970PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005971"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005973Return True if S is a titlecased string and there is at least one\n\
5974character in S, i.e. upper- and titlecase characters may only\n\
5975follow uncased characters and lowercase characters only cased ones.\n\
5976Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
5978static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005979unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980{
5981 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5982 register const Py_UNICODE *e;
5983 int cased, previous_is_cased;
5984
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 /* Shortcut for single character strings */
5986 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005987 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5988 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005990 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005991 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 e = p + PyUnicode_GET_SIZE(self);
5995 cased = 0;
5996 previous_is_cased = 0;
5997 for (; p < e; p++) {
5998 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6001 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006002 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 previous_is_cased = 1;
6004 cased = 1;
6005 }
6006 else if (Py_UNICODE_ISLOWER(ch)) {
6007 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 previous_is_cased = 1;
6010 cased = 1;
6011 }
6012 else
6013 previous_is_cased = 0;
6014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006015 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006019"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006021Return True if all characters in S are whitespace\n\
6022and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006025unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
6027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6028 register const Py_UNICODE *e;
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Shortcut for single character strings */
6031 if (PyUnicode_GET_SIZE(self) == 1 &&
6032 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006035 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006036 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 e = p + PyUnicode_GET_SIZE(self);
6040 for (; p < e; p++) {
6041 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006042 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006048"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006050Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006051and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006052
6053static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006054unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006055{
6056 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6057 register const Py_UNICODE *e;
6058
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006059 /* Shortcut for single character strings */
6060 if (PyUnicode_GET_SIZE(self) == 1 &&
6061 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006062 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006063
6064 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006065 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006066 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006067
6068 e = p + PyUnicode_GET_SIZE(self);
6069 for (; p < e; p++) {
6070 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006071 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006072 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006073 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006074}
6075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006076PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006077"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006078\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006079Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006080and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006081
6082static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006083unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006084{
6085 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6086 register const Py_UNICODE *e;
6087
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006088 /* Shortcut for single character strings */
6089 if (PyUnicode_GET_SIZE(self) == 1 &&
6090 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006091 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006092
6093 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006094 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006095 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006096
6097 e = p + PyUnicode_GET_SIZE(self);
6098 for (; p < e; p++) {
6099 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006100 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006101 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006106"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006112unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113{
6114 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6115 register const Py_UNICODE *e;
6116
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 /* Shortcut for single character strings */
6118 if (PyUnicode_GET_SIZE(self) == 1 &&
6119 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006120 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006122 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006123 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006124 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 e = p + PyUnicode_GET_SIZE(self);
6127 for (; p < e; p++) {
6128 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006129 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006131 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132}
6133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006134PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006135"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006137Return True if all characters in S are digits\n\
6138and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
6140static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006141unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
6143 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6144 register const Py_UNICODE *e;
6145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 /* Shortcut for single character strings */
6147 if (PyUnicode_GET_SIZE(self) == 1 &&
6148 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006151 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006152 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006153 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006154
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 e = p + PyUnicode_GET_SIZE(self);
6156 for (; p < e; p++) {
6157 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006158 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006160 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161}
6162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006163PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006164"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006170unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
6172 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6173 register const Py_UNICODE *e;
6174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 /* Shortcut for single character strings */
6176 if (PyUnicode_GET_SIZE(self) == 1 &&
6177 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006178 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006180 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006181 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006182 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 e = p + PyUnicode_GET_SIZE(self);
6185 for (; p < e; p++) {
6186 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006187 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006189 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006192PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193"S.join(sequence) -> unicode\n\
6194\n\
6195Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006196sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
6198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006199unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006201 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202}
6203
Martin v. Löwis18e16552006-02-15 17:27:45 +00006204static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205unicode_length(PyUnicodeObject *self)
6206{
6207 return self->length;
6208}
6209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006210PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006211"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212\n\
6213Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006214done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
6216static PyObject *
6217unicode_ljust(PyUnicodeObject *self, PyObject *args)
6218{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006219 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006220 Py_UNICODE fillchar = ' ';
6221
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006222 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 return NULL;
6224
Tim Peters7a29bd52001-09-12 03:03:31 +00006225 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 Py_INCREF(self);
6227 return (PyObject*) self;
6228 }
6229
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006230 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231}
6232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006233PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234"S.lower() -> unicode\n\
6235\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006236Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237
6238static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006239unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 return fixup(self, fixlower);
6242}
6243
/* Selector values for the strip helpers.  They double as indices into
   stripformat[] below, so the numbering must stay in step with it. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per selector value */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: its format string minus the leading "|O:" */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6252
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006253/* externally visible for str.strip(unicode) */
6254PyObject *
6255_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6256{
6257 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006258 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006259 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006260 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6261 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006262
Thomas Wouters477c8d52006-05-27 19:21:47 +00006263 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6264
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006265 i = 0;
6266 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006267 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6268 i++;
6269 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006270 }
6271
6272 j = len;
6273 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006274 do {
6275 j--;
6276 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6277 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278 }
6279
6280 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006281 Py_INCREF(self);
6282 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006283 }
6284 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006285 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006286}
6287
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288
6289static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006290do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006292 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006293 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006294
6295 i = 0;
6296 if (striptype != RIGHTSTRIP) {
6297 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6298 i++;
6299 }
6300 }
6301
6302 j = len;
6303 if (striptype != LEFTSTRIP) {
6304 do {
6305 j--;
6306 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6307 j++;
6308 }
6309
6310 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6311 Py_INCREF(self);
6312 return (PyObject*)self;
6313 }
6314 else
6315 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316}
6317
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006318
6319static PyObject *
6320do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6321{
6322 PyObject *sep = NULL;
6323
6324 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6325 return NULL;
6326
6327 if (sep != NULL && sep != Py_None) {
6328 if (PyUnicode_Check(sep))
6329 return _PyUnicode_XStrip(self, striptype, sep);
6330 else if (PyString_Check(sep)) {
6331 PyObject *res;
6332 sep = PyUnicode_FromObject(sep);
6333 if (sep==NULL)
6334 return NULL;
6335 res = _PyUnicode_XStrip(self, striptype, sep);
6336 Py_DECREF(sep);
6337 return res;
6338 }
6339 else {
6340 PyErr_Format(PyExc_TypeError,
6341 "%s arg must be None, unicode or str",
6342 STRIPNAME(striptype));
6343 return NULL;
6344 }
6345 }
6346
6347 return do_strip(self, striptype);
6348}
6349
6350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006351PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006352"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006353\n\
6354Return a copy of the string S with leading and trailing\n\
6355whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006356If chars is given and not None, remove characters in chars instead.\n\
6357If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006358
6359static PyObject *
6360unicode_strip(PyUnicodeObject *self, PyObject *args)
6361{
6362 if (PyTuple_GET_SIZE(args) == 0)
6363 return do_strip(self, BOTHSTRIP); /* Common case */
6364 else
6365 return do_argstrip(self, BOTHSTRIP, args);
6366}
6367
6368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006369PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006370"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006371\n\
6372Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006373If chars is given and not None, remove characters in chars instead.\n\
6374If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006375
6376static PyObject *
6377unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6378{
6379 if (PyTuple_GET_SIZE(args) == 0)
6380 return do_strip(self, LEFTSTRIP); /* Common case */
6381 else
6382 return do_argstrip(self, LEFTSTRIP, args);
6383}
6384
6385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006386PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006387"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006388\n\
6389Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006390If chars is given and not None, remove characters in chars instead.\n\
6391If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006392
6393static PyObject *
6394unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6395{
6396 if (PyTuple_GET_SIZE(args) == 0)
6397 return do_strip(self, RIGHTSTRIP); /* Common case */
6398 else
6399 return do_argstrip(self, RIGHTSTRIP, args);
6400}
6401
6402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
6406 PyUnicodeObject *u;
6407 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006409 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410
6411 if (len < 0)
6412 len = 0;
6413
Tim Peters7a29bd52001-09-12 03:03:31 +00006414 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 /* no repeat, return original string */
6416 Py_INCREF(str);
6417 return (PyObject*) str;
6418 }
Tim Peters8f422462000-09-09 06:13:41 +00006419
6420 /* ensure # of chars needed doesn't overflow int and # of bytes
6421 * needed doesn't overflow size_t
6422 */
6423 nchars = len * str->length;
6424 if (len && nchars / len != str->length) {
6425 PyErr_SetString(PyExc_OverflowError,
6426 "repeated string is too long");
6427 return NULL;
6428 }
6429 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6430 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6431 PyErr_SetString(PyExc_OverflowError,
6432 "repeated string is too long");
6433 return NULL;
6434 }
6435 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 if (!u)
6437 return NULL;
6438
6439 p = u->str;
6440
Thomas Wouters477c8d52006-05-27 19:21:47 +00006441 if (str->length == 1 && len > 0) {
6442 Py_UNICODE_FILL(p, str->str[0], len);
6443 } else {
6444 Py_ssize_t done = 0; /* number of characters copied this far */
6445 if (done < nchars) {
6446 Py_UNICODE_COPY(p, str->str, str->length);
6447 done = str->length;
6448 }
6449 while (done < nchars) {
6450 int n = (done <= nchars-done) ? done : nchars-done;
6451 Py_UNICODE_COPY(p+done, p, n);
6452 done += n;
6453 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 }
6455
6456 return (PyObject*) u;
6457}
6458
6459PyObject *PyUnicode_Replace(PyObject *obj,
6460 PyObject *subobj,
6461 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
6464 PyObject *self;
6465 PyObject *str1;
6466 PyObject *str2;
6467 PyObject *result;
6468
6469 self = PyUnicode_FromObject(obj);
6470 if (self == NULL)
6471 return NULL;
6472 str1 = PyUnicode_FromObject(subobj);
6473 if (str1 == NULL) {
6474 Py_DECREF(self);
6475 return NULL;
6476 }
6477 str2 = PyUnicode_FromObject(replobj);
6478 if (str2 == NULL) {
6479 Py_DECREF(self);
6480 Py_DECREF(str1);
6481 return NULL;
6482 }
Tim Petersced69f82003-09-16 20:30:58 +00006483 result = replace((PyUnicodeObject *)self,
6484 (PyUnicodeObject *)str1,
6485 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 maxcount);
6487 Py_DECREF(self);
6488 Py_DECREF(str1);
6489 Py_DECREF(str2);
6490 return result;
6491}
6492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006493PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494"S.replace (old, new[, maxsplit]) -> unicode\n\
6495\n\
6496Return a copy of S with all occurrences of substring\n\
6497old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006498given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499
6500static PyObject*
6501unicode_replace(PyUnicodeObject *self, PyObject *args)
6502{
6503 PyUnicodeObject *str1;
6504 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 PyObject *result;
6507
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 return NULL;
6510 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6511 if (str1 == NULL)
6512 return NULL;
6513 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006514 if (str2 == NULL) {
6515 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518
6519 result = replace(self, str1, str2, maxcount);
6520
6521 Py_DECREF(str1);
6522 Py_DECREF(str2);
6523 return result;
6524}
6525
6526static
6527PyObject *unicode_repr(PyObject *unicode)
6528{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006529 PyObject *repr;
6530 char *p;
6531 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6532 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6533
6534 /* XXX(nnorwitz): rather than over-allocating, it would be
6535 better to choose a different scheme. Perhaps scan the
6536 first N-chars of the string and allocate based on that size.
6537 */
6538 /* Initial allocation is based on the longest-possible unichr
6539 escape.
6540
6541 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6542 unichr, so in this case it's the longest unichr escape. In
6543 narrow (UTF-16) builds this is five chars per source unichr
6544 since there are two unichrs in the surrogate pair, so in narrow
6545 (UTF-16) builds it's not the longest unichr escape.
6546
6547 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6548 so in the narrow (UTF-16) build case it's the longest unichr
6549 escape.
6550 */
6551
6552 repr = PyString_FromStringAndSize(NULL,
6553 2 /* quotes */
6554#ifdef Py_UNICODE_WIDE
6555 + 10*size
6556#else
6557 + 6*size
6558#endif
6559 + 1);
6560 if (repr == NULL)
6561 return NULL;
6562
6563 p = PyString_AS_STRING(repr);
6564
6565 /* Add quote */
6566 *p++ = (findchar(s, size, '\'') &&
6567 !findchar(s, size, '"')) ? '"' : '\'';
6568 while (size-- > 0) {
6569 Py_UNICODE ch = *s++;
6570
6571 /* Escape quotes and backslashes */
6572 if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
6573 *p++ = '\\';
6574 *p++ = (char) ch;
6575 continue;
6576 }
6577
6578#ifdef Py_UNICODE_WIDE
6579 /* Map 21-bit characters to '\U00xxxxxx' */
6580 else if (ch >= 0x10000) {
6581 *p++ = '\\';
6582 *p++ = 'U';
6583 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6584 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6585 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6586 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6587 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6588 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6589 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6590 *p++ = hexdigits[ch & 0x0000000F];
6591 continue;
6592 }
6593#else
6594 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6595 else if (ch >= 0xD800 && ch < 0xDC00) {
6596 Py_UNICODE ch2;
6597 Py_UCS4 ucs;
6598
6599 ch2 = *s++;
6600 size--;
6601 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6602 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6603 *p++ = '\\';
6604 *p++ = 'U';
6605 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6606 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6607 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6608 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6609 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6610 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6611 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6612 *p++ = hexdigits[ucs & 0x0000000F];
6613 continue;
6614 }
6615 /* Fall through: isolated surrogates are copied as-is */
6616 s--;
6617 size++;
6618 }
6619#endif
6620
6621 /* Map 16-bit characters to '\uxxxx' */
6622 if (ch >= 256) {
6623 *p++ = '\\';
6624 *p++ = 'u';
6625 *p++ = hexdigits[(ch >> 12) & 0x000F];
6626 *p++ = hexdigits[(ch >> 8) & 0x000F];
6627 *p++ = hexdigits[(ch >> 4) & 0x000F];
6628 *p++ = hexdigits[ch & 0x000F];
6629 }
6630
6631 /* Map special whitespace to '\t', \n', '\r' */
6632 else if (ch == '\t') {
6633 *p++ = '\\';
6634 *p++ = 't';
6635 }
6636 else if (ch == '\n') {
6637 *p++ = '\\';
6638 *p++ = 'n';
6639 }
6640 else if (ch == '\r') {
6641 *p++ = '\\';
6642 *p++ = 'r';
6643 }
6644
6645 /* Map non-printable US ASCII to '\xhh' */
6646 else if (ch < ' ' || ch >= 0x7F) {
6647 *p++ = '\\';
6648 *p++ = 'x';
6649 *p++ = hexdigits[(ch >> 4) & 0x000F];
6650 *p++ = hexdigits[ch & 0x000F];
6651 }
6652
6653 /* Copy everything else as-is */
6654 else
6655 *p++ = (char) ch;
6656 }
6657 /* Add quote */
6658 *p++ = PyString_AS_STRING(repr)[0];
6659
6660 *p = '\0';
6661 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
6662 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666"S.rfind(sub [,start [,end]]) -> int\n\
6667\n\
6668Return the highest index in S where substring sub is found,\n\
6669such that sub is contained within s[start,end]. Optional\n\
6670arguments start and end are interpreted as in slice notation.\n\
6671\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006672Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673
6674static PyObject *
6675unicode_rfind(PyUnicodeObject *self, PyObject *args)
6676{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006677 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006678 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006679 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006680 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
Guido van Rossumb8872e62000-05-09 14:14:27 +00006682 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6683 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685 substring = PyUnicode_FromObject(substring);
6686 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 return NULL;
6688
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689 result = stringlib_rfind_slice(
6690 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6691 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6692 start, end
6693 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
6695 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696
6697 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698}
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701"S.rindex(sub [,start [,end]]) -> int\n\
6702\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006703Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704
6705static PyObject *
6706unicode_rindex(PyUnicodeObject *self, PyObject *args)
6707{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006708 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006710 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006711 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Guido van Rossumb8872e62000-05-09 14:14:27 +00006713 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6714 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006716 substring = PyUnicode_FromObject(substring);
6717 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 return NULL;
6719
Thomas Wouters477c8d52006-05-27 19:21:47 +00006720 result = stringlib_rfind_slice(
6721 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6722 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6723 start, end
6724 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
6726 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006727
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 if (result < 0) {
6729 PyErr_SetString(PyExc_ValueError, "substring not found");
6730 return NULL;
6731 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006732 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733}
6734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006736"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737\n\
6738Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006739done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
6741static PyObject *
6742unicode_rjust(PyUnicodeObject *self, PyObject *args)
6743{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006744 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006745 Py_UNICODE fillchar = ' ';
6746
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006747 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 return NULL;
6749
Tim Peters7a29bd52001-09-12 03:03:31 +00006750 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 Py_INCREF(self);
6752 return (PyObject*) self;
6753 }
6754
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006755 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756}
6757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
6761 /* standard clamping */
6762 if (start < 0)
6763 start = 0;
6764 if (end < 0)
6765 end = 0;
6766 if (end > self->length)
6767 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006768 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 /* full slice, return original string */
6770 Py_INCREF(self);
6771 return (PyObject*) self;
6772 }
6773 if (start > end)
6774 start = end;
6775 /* copy slice */
6776 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6777 end - start);
6778}
6779
6780PyObject *PyUnicode_Split(PyObject *s,
6781 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006782 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783{
6784 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 s = PyUnicode_FromObject(s);
6787 if (s == NULL)
6788 return NULL;
6789 if (sep != NULL) {
6790 sep = PyUnicode_FromObject(sep);
6791 if (sep == NULL) {
6792 Py_DECREF(s);
6793 return NULL;
6794 }
6795 }
6796
6797 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6798
6799 Py_DECREF(s);
6800 Py_XDECREF(sep);
6801 return result;
6802}
6803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805"S.split([sep [,maxsplit]]) -> list of strings\n\
6806\n\
6807Return a list of the words in S, using sep as the\n\
6808delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006809splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006810any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject*
6813unicode_split(PyUnicodeObject *self, PyObject *args)
6814{
6815 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817
Martin v. Löwis18e16552006-02-15 17:27:45 +00006818 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 return NULL;
6820
6821 if (substring == Py_None)
6822 return split(self, NULL, maxcount);
6823 else if (PyUnicode_Check(substring))
6824 return split(self, (PyUnicodeObject *)substring, maxcount);
6825 else
6826 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6827}
6828
Thomas Wouters477c8d52006-05-27 19:21:47 +00006829PyObject *
6830PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6831{
6832 PyObject* str_obj;
6833 PyObject* sep_obj;
6834 PyObject* out;
6835
6836 str_obj = PyUnicode_FromObject(str_in);
6837 if (!str_obj)
6838 return NULL;
6839 sep_obj = PyUnicode_FromObject(sep_in);
6840 if (!sep_obj) {
6841 Py_DECREF(str_obj);
6842 return NULL;
6843 }
6844
6845 out = stringlib_partition(
6846 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6847 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6848 );
6849
6850 Py_DECREF(sep_obj);
6851 Py_DECREF(str_obj);
6852
6853 return out;
6854}
6855
6856
6857PyObject *
6858PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6859{
6860 PyObject* str_obj;
6861 PyObject* sep_obj;
6862 PyObject* out;
6863
6864 str_obj = PyUnicode_FromObject(str_in);
6865 if (!str_obj)
6866 return NULL;
6867 sep_obj = PyUnicode_FromObject(sep_in);
6868 if (!sep_obj) {
6869 Py_DECREF(str_obj);
6870 return NULL;
6871 }
6872
6873 out = stringlib_rpartition(
6874 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6875 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6876 );
6877
6878 Py_DECREF(sep_obj);
6879 Py_DECREF(str_obj);
6880
6881 return out;
6882}
6883
6884PyDoc_STRVAR(partition__doc__,
6885"S.partition(sep) -> (head, sep, tail)\n\
6886\n\
6887Searches for the separator sep in S, and returns the part before it,\n\
6888the separator itself, and the part after it. If the separator is not\n\
6889found, returns S and two empty strings.");
6890
6891static PyObject*
6892unicode_partition(PyUnicodeObject *self, PyObject *separator)
6893{
6894 return PyUnicode_Partition((PyObject *)self, separator);
6895}
6896
6897PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006898"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899\n\
6900Searches for the separator sep in S, starting at the end of S, and returns\n\
6901the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006902separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006903
6904static PyObject*
6905unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6906{
6907 return PyUnicode_RPartition((PyObject *)self, separator);
6908}
6909
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006910PyObject *PyUnicode_RSplit(PyObject *s,
6911 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006912 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006913{
6914 PyObject *result;
6915
6916 s = PyUnicode_FromObject(s);
6917 if (s == NULL)
6918 return NULL;
6919 if (sep != NULL) {
6920 sep = PyUnicode_FromObject(sep);
6921 if (sep == NULL) {
6922 Py_DECREF(s);
6923 return NULL;
6924 }
6925 }
6926
6927 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6928
6929 Py_DECREF(s);
6930 Py_XDECREF(sep);
6931 return result;
6932}
6933
6934PyDoc_STRVAR(rsplit__doc__,
6935"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6936\n\
6937Return a list of the words in S, using sep as the\n\
6938delimiter string, starting at the end of the string and\n\
6939working to the front. If maxsplit is given, at most maxsplit\n\
6940splits are done. If sep is not specified, any whitespace string\n\
6941is a separator.");
6942
6943static PyObject*
6944unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6945{
6946 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006947 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006948
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006950 return NULL;
6951
6952 if (substring == Py_None)
6953 return rsplit(self, NULL, maxcount);
6954 else if (PyUnicode_Check(substring))
6955 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6956 else
6957 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6958}
6959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006961"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962\n\
6963Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006964Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006965is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967static PyObject*
6968unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6969{
Guido van Rossum86662912000-04-11 15:38:46 +00006970 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971
Guido van Rossum86662912000-04-11 15:38:46 +00006972 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 return NULL;
6974
Guido van Rossum86662912000-04-11 15:38:46 +00006975 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976}
6977
6978static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006979PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006981 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6982 Py_XINCREF(res);
6983 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984}
6985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006986PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987"S.swapcase() -> unicode\n\
6988\n\
6989Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006990and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006993unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 return fixup(self, fixswapcase);
6996}
6997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006998PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999"S.translate(table) -> unicode\n\
7000\n\
7001Return a copy of the string S, where all characters have been mapped\n\
7002through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007003Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7004Unmapped characters are left untouched. Characters mapped to None\n\
7005are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
7007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007008unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
Tim Petersced69f82003-09-16 20:30:58 +00007010 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007012 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 "ignore");
7014}
7015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017"S.upper() -> unicode\n\
7018\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
7021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007022unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 return fixup(self, fixupper);
7025}
7026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028"S.zfill(width) -> unicode\n\
7029\n\
7030Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032
7033static PyObject *
7034unicode_zfill(PyUnicodeObject *self, PyObject *args)
7035{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007036 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 PyUnicodeObject *u;
7038
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039 Py_ssize_t width;
7040 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 return NULL;
7042
7043 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007044 if (PyUnicode_CheckExact(self)) {
7045 Py_INCREF(self);
7046 return (PyObject*) self;
7047 }
7048 else
7049 return PyUnicode_FromUnicode(
7050 PyUnicode_AS_UNICODE(self),
7051 PyUnicode_GET_SIZE(self)
7052 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 }
7054
7055 fill = width - self->length;
7056
7057 u = pad(self, fill, 0, '0');
7058
Walter Dörwald068325e2002-04-15 13:36:47 +00007059 if (u == NULL)
7060 return NULL;
7061
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 if (u->str[fill] == '+' || u->str[fill] == '-') {
7063 /* move sign to beginning of string */
7064 u->str[0] = u->str[fill];
7065 u->str[fill] = '0';
7066 }
7067
7068 return (PyObject*) u;
7069}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
#if 0
/* Compiled out.  Would report the value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007080"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007082Return True if S starts with the specified prefix, False otherwise.\n\
7083With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084With optional end, stop comparing S at that position.\n\
7085prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
7087static PyObject *
7088unicode_startswith(PyUnicodeObject *self,
7089 PyObject *args)
7090{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007093 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007094 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007098 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100 if (PyTuple_Check(subobj)) {
7101 Py_ssize_t i;
7102 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7103 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7104 PyTuple_GET_ITEM(subobj, i));
7105 if (substring == NULL)
7106 return NULL;
7107 result = tailmatch(self, substring, start, end, -1);
7108 Py_DECREF(substring);
7109 if (result) {
7110 Py_RETURN_TRUE;
7111 }
7112 }
7113 /* nothing matched */
7114 Py_RETURN_FALSE;
7115 }
7116 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118 return NULL;
7119 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122}
7123
7124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007126"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007128Return True if S ends with the specified suffix, False otherwise.\n\
7129With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130With optional end, stop comparing S at that position.\n\
7131suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject *
7134unicode_endswith(PyUnicodeObject *self,
7135 PyObject *args)
7136{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007140 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7144 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007146 if (PyTuple_Check(subobj)) {
7147 Py_ssize_t i;
7148 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7149 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7150 PyTuple_GET_ITEM(subobj, i));
7151 if (substring == NULL)
7152 return NULL;
7153 result = tailmatch(self, substring, start, end, +1);
7154 Py_DECREF(substring);
7155 if (result) {
7156 Py_RETURN_TRUE;
7157 }
7158 }
7159 Py_RETURN_FALSE;
7160 }
7161 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168}
7169
7170
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007171
7172static PyObject *
7173unicode_getnewargs(PyUnicodeObject *v)
7174{
7175 return Py_BuildValue("(u#)", v->str, v->length);
7176}
7177
7178
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179static PyMethodDef unicode_methods[] = {
7180
7181 /* Order is according to common usage: often used methods should
7182 appear first, since lookup is done sequentially. */
7183
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007184 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7185 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7186 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007187 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007188 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7189 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7190 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7191 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7192 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7193 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7194 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007195 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007196 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7197 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7198 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007200 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007201/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7202 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7203 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7204 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007205 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007206 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007207 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007209 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7210 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7211 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7212 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7213 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7214 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7215 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7216 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7217 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7218 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7219 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7220 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7221 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7222 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007223 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007224#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007225 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226#endif
7227
7228#if 0
7229 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007230 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231#endif
7232
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007233 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 {NULL, NULL}
7235};
7236
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007237static PyObject *
7238unicode_mod(PyObject *v, PyObject *w)
7239{
7240 if (!PyUnicode_Check(v)) {
7241 Py_INCREF(Py_NotImplemented);
7242 return Py_NotImplemented;
7243 }
7244 return PyUnicode_Format(v, w);
7245}
7246
7247static PyNumberMethods unicode_as_number = {
7248 0, /*nb_add*/
7249 0, /*nb_subtract*/
7250 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007251 unicode_mod, /*nb_remainder*/
7252};
7253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007256 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7258 (ssizeargfunc) unicode_getitem, /* sq_item */
7259 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 0, /* sq_ass_item */
7261 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007262 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263};
7264
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007265static PyObject*
7266unicode_subscript(PyUnicodeObject* self, PyObject* item)
7267{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007268 if (PyIndex_Check(item)) {
7269 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007270 if (i == -1 && PyErr_Occurred())
7271 return NULL;
7272 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007273 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007274 return unicode_getitem(self, i);
7275 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007276 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007277 Py_UNICODE* source_buf;
7278 Py_UNICODE* result_buf;
7279 PyObject* result;
7280
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007281 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007282 &start, &stop, &step, &slicelength) < 0) {
7283 return NULL;
7284 }
7285
7286 if (slicelength <= 0) {
7287 return PyUnicode_FromUnicode(NULL, 0);
7288 } else {
7289 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007290 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7291 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007292
7293 if (result_buf == NULL)
7294 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007295
7296 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7297 result_buf[i] = source_buf[cur];
7298 }
Tim Petersced69f82003-09-16 20:30:58 +00007299
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007300 result = PyUnicode_FromUnicode(result_buf, slicelength);
7301 PyMem_FREE(result_buf);
7302 return result;
7303 }
7304 } else {
7305 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7306 return NULL;
7307 }
7308}
7309
7310static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007311 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007312 (binaryfunc)unicode_subscript, /* mp_subscript */
7313 (objobjargproc)0, /* mp_ass_subscript */
7314};
7315
Martin v. Löwis18e16552006-02-15 17:27:45 +00007316static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007318 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 const void **ptr)
7320{
7321 if (index != 0) {
7322 PyErr_SetString(PyExc_SystemError,
7323 "accessing non-existent unicode segment");
7324 return -1;
7325 }
7326 *ptr = (void *) self->str;
7327 return PyUnicode_GET_DATA_SIZE(self);
7328}
7329
Martin v. Löwis18e16552006-02-15 17:27:45 +00007330static Py_ssize_t
7331unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 const void **ptr)
7333{
7334 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007335 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 return -1;
7337}
7338
7339static int
7340unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007341 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
7343 if (lenp)
7344 *lenp = PyUnicode_GET_DATA_SIZE(self);
7345 return 1;
7346}
7347
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007348static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007350 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 const void **ptr)
7352{
7353 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007354
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 if (index != 0) {
7356 PyErr_SetString(PyExc_SystemError,
7357 "accessing non-existent unicode segment");
7358 return -1;
7359 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007360 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 if (str == NULL)
7362 return -1;
7363 *ptr = (void *) PyString_AS_STRING(str);
7364 return PyString_GET_SIZE(str);
7365}
7366
7367/* Helpers for PyUnicode_Format() */
7368
7369static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007370getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007372 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 if (argidx < arglen) {
7374 (*p_argidx)++;
7375 if (arglen < 0)
7376 return args;
7377 else
7378 return PyTuple_GetItem(args, argidx);
7379 }
7380 PyErr_SetString(PyExc_TypeError,
7381 "not enough arguments for format string");
7382 return NULL;
7383}
7384
/* Bit flags describing a conversion specification; they are combined in
   the "flags" argument of the format helpers.  F_ALT selects the '#'
   alternate form there.  The other names presumably correspond to the
   printf flags '-', '+', ' ' and '0' -- confirm where they are set. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
7390
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007392strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007394 register Py_ssize_t i;
7395 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 for (i = len - 1; i >= 0; i--)
7397 buffer[i] = (Py_UNICODE) charbuffer[i];
7398
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 return len;
7400}
7401
Neal Norwitzfc76d632006-01-10 06:03:13 +00007402static int
7403doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7404{
Tim Peters15231542006-02-16 01:08:01 +00007405 Py_ssize_t result;
7406
Neal Norwitzfc76d632006-01-10 06:03:13 +00007407 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007408 result = strtounicode(buffer, (char *)buffer);
7409 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007410}
7411
7412static int
7413longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7414{
Tim Peters15231542006-02-16 01:08:01 +00007415 Py_ssize_t result;
7416
Neal Norwitzfc76d632006-01-10 06:03:13 +00007417 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007418 result = strtounicode(buffer, (char *)buffer);
7419 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007420}
7421
Guido van Rossum078151d2002-08-11 04:24:12 +00007422/* XXX To save some code duplication, formatfloat/long/int could have been
7423 shared with stringobject.c, converting from 8-bit to Unicode after the
7424 formatting is done. */
7425
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426static int
7427formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007428 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 int flags,
7430 int prec,
7431 int type,
7432 PyObject *v)
7433{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007434 /* fmt = '%#.' + `prec` + `type`
7435 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 char fmt[20];
7437 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 x = PyFloat_AsDouble(v);
7440 if (x == -1.0 && PyErr_Occurred())
7441 return -1;
7442 if (prec < 0)
7443 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7445 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007446 /* Worst case length calc to ensure no buffer overrun:
7447
7448 'g' formats:
7449 fmt = %#.<prec>g
7450 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7451 for any double rep.)
7452 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7453
7454 'f' formats:
7455 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7456 len = 1 + 50 + 1 + prec = 52 + prec
7457
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007458 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007459 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007460
7461 */
7462 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7463 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007464 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007465 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007466 return -1;
7467 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007468 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7469 (flags&F_ALT) ? "#" : "",
7470 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007471 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472}
7473
Tim Peters38fd5b62000-09-21 05:43:11 +00007474static PyObject*
7475formatlong(PyObject *val, int flags, int prec, int type)
7476{
7477 char *buf;
7478 int i, len;
7479 PyObject *str; /* temporary string object. */
7480 PyUnicodeObject *result;
7481
7482 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7483 if (!str)
7484 return NULL;
7485 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007486 if (!result) {
7487 Py_DECREF(str);
7488 return NULL;
7489 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007490 for (i = 0; i < len; i++)
7491 result->str[i] = buf[i];
7492 result->str[len] = 0;
7493 Py_DECREF(str);
7494 return (PyObject*)result;
7495}
7496
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497static int
7498formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007499 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 int flags,
7501 int prec,
7502 int type,
7503 PyObject *v)
7504{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007505 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007506 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7507 * + 1 + 1
7508 * = 24
7509 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007510 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007511 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 long x;
7513
7514 x = PyInt_AsLong(v);
7515 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007516 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007517 if (x < 0 && type == 'u') {
7518 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007519 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007520 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7521 sign = "-";
7522 else
7523 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007525 prec = 1;
7526
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007527 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7528 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007529 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007530 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007531 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007532 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007533 return -1;
7534 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007535
7536 if ((flags & F_ALT) &&
7537 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007538 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007539 * of issues that cause pain:
7540 * - when 0 is being converted, the C standard leaves off
7541 * the '0x' or '0X', which is inconsistent with other
7542 * %#x/%#X conversions and inconsistent with Python's
7543 * hex() function
7544 * - there are platforms that violate the standard and
7545 * convert 0 with the '0x' or '0X'
7546 * (Metrowerks, Compaq Tru64)
7547 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007548 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007549 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007550 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007551 * We can achieve the desired consistency by inserting our
7552 * own '0x' or '0X' prefix, and substituting %x/%X in place
7553 * of %#x/%#X.
7554 *
7555 * Note that this is the same approach as used in
7556 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007557 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007558 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7559 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007560 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007561 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007562 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7563 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007564 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007565 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007566 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007567 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007568 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007569 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570}
7571
7572static int
7573formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007574 size_t buflen,
7575 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007577 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007578 if (PyUnicode_Check(v)) {
7579 if (PyUnicode_GET_SIZE(v) != 1)
7580 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007584 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007585 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007586 goto onError;
7587 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590 else {
7591 /* Integer input truncated to a character */
7592 long x;
7593 x = PyInt_AsLong(v);
7594 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007595 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007596#ifdef Py_UNICODE_WIDE
7597 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007598 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007599 "%c arg not in range(0x110000) "
7600 "(wide Python build)");
7601 return -1;
7602 }
7603#else
7604 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007605 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007606 "%c arg not in range(0x10000) "
7607 "(narrow Python build)");
7608 return -1;
7609 }
7610#endif
7611 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 }
7613 buf[1] = '\0';
7614 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007615
7616 onError:
7617 PyErr_SetString(PyExc_TypeError,
7618 "%c requires int or char");
7619 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620}
7621
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007622/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7623
7624 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7625 chars are formatted. XXX This is a magic number. Each formatting
7626 routine does bounds checking to ensure no overflow, but a better
7627 solution may be to malloc a buffer of appropriate size for each
7628 format. For now, the current solution is sufficient.
7629*/
7630#define FORMATBUFLEN (size_t)120
7631
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632PyObject *PyUnicode_Format(PyObject *format,
7633 PyObject *args)
7634{
7635 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 int args_owned = 0;
7638 PyUnicodeObject *result = NULL;
7639 PyObject *dict = NULL;
7640 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007641
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 if (format == NULL || args == NULL) {
7643 PyErr_BadInternalCall();
7644 return NULL;
7645 }
7646 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007647 if (uformat == NULL)
7648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 fmt = PyUnicode_AS_UNICODE(uformat);
7650 fmtcnt = PyUnicode_GET_SIZE(uformat);
7651
7652 reslen = rescnt = fmtcnt + 100;
7653 result = _PyUnicode_New(reslen);
7654 if (result == NULL)
7655 goto onError;
7656 res = PyUnicode_AS_UNICODE(result);
7657
7658 if (PyTuple_Check(args)) {
7659 arglen = PyTuple_Size(args);
7660 argidx = 0;
7661 }
7662 else {
7663 arglen = -1;
7664 argidx = -2;
7665 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007666 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7667 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 dict = args;
7669
7670 while (--fmtcnt >= 0) {
7671 if (*fmt != '%') {
7672 if (--rescnt < 0) {
7673 rescnt = fmtcnt + 100;
7674 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007675 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7678 --rescnt;
7679 }
7680 *res++ = *fmt++;
7681 }
7682 else {
7683 /* Got a format specifier */
7684 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687 Py_UNICODE c = '\0';
7688 Py_UNICODE fill;
7689 PyObject *v = NULL;
7690 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007691 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007694 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
7696 fmt++;
7697 if (*fmt == '(') {
7698 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007699 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 PyObject *key;
7701 int pcount = 1;
7702
7703 if (dict == NULL) {
7704 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007705 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 goto onError;
7707 }
7708 ++fmt;
7709 --fmtcnt;
7710 keystart = fmt;
7711 /* Skip over balanced parentheses */
7712 while (pcount > 0 && --fmtcnt >= 0) {
7713 if (*fmt == ')')
7714 --pcount;
7715 else if (*fmt == '(')
7716 ++pcount;
7717 fmt++;
7718 }
7719 keylen = fmt - keystart - 1;
7720 if (fmtcnt < 0 || pcount > 0) {
7721 PyErr_SetString(PyExc_ValueError,
7722 "incomplete format key");
7723 goto onError;
7724 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007725#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007726 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 then looked up since Python uses strings to hold
7728 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007729 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 key = PyUnicode_EncodeUTF8(keystart,
7731 keylen,
7732 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007733#else
7734 key = PyUnicode_FromUnicode(keystart, keylen);
7735#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 if (key == NULL)
7737 goto onError;
7738 if (args_owned) {
7739 Py_DECREF(args);
7740 args_owned = 0;
7741 }
7742 args = PyObject_GetItem(dict, key);
7743 Py_DECREF(key);
7744 if (args == NULL) {
7745 goto onError;
7746 }
7747 args_owned = 1;
7748 arglen = -1;
7749 argidx = -2;
7750 }
7751 while (--fmtcnt >= 0) {
7752 switch (c = *fmt++) {
7753 case '-': flags |= F_LJUST; continue;
7754 case '+': flags |= F_SIGN; continue;
7755 case ' ': flags |= F_BLANK; continue;
7756 case '#': flags |= F_ALT; continue;
7757 case '0': flags |= F_ZERO; continue;
7758 }
7759 break;
7760 }
7761 if (c == '*') {
7762 v = getnextarg(args, arglen, &argidx);
7763 if (v == NULL)
7764 goto onError;
7765 if (!PyInt_Check(v)) {
7766 PyErr_SetString(PyExc_TypeError,
7767 "* wants int");
7768 goto onError;
7769 }
7770 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007771 if (width == -1 && PyErr_Occurred())
7772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 if (width < 0) {
7774 flags |= F_LJUST;
7775 width = -width;
7776 }
7777 if (--fmtcnt >= 0)
7778 c = *fmt++;
7779 }
7780 else if (c >= '0' && c <= '9') {
7781 width = c - '0';
7782 while (--fmtcnt >= 0) {
7783 c = *fmt++;
7784 if (c < '0' || c > '9')
7785 break;
7786 if ((width*10) / 10 != width) {
7787 PyErr_SetString(PyExc_ValueError,
7788 "width too big");
7789 goto onError;
7790 }
7791 width = width*10 + (c - '0');
7792 }
7793 }
7794 if (c == '.') {
7795 prec = 0;
7796 if (--fmtcnt >= 0)
7797 c = *fmt++;
7798 if (c == '*') {
7799 v = getnextarg(args, arglen, &argidx);
7800 if (v == NULL)
7801 goto onError;
7802 if (!PyInt_Check(v)) {
7803 PyErr_SetString(PyExc_TypeError,
7804 "* wants int");
7805 goto onError;
7806 }
7807 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007808 if (prec == -1 && PyErr_Occurred())
7809 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 if (prec < 0)
7811 prec = 0;
7812 if (--fmtcnt >= 0)
7813 c = *fmt++;
7814 }
7815 else if (c >= '0' && c <= '9') {
7816 prec = c - '0';
7817 while (--fmtcnt >= 0) {
7818 c = Py_CHARMASK(*fmt++);
7819 if (c < '0' || c > '9')
7820 break;
7821 if ((prec*10) / 10 != prec) {
7822 PyErr_SetString(PyExc_ValueError,
7823 "prec too big");
7824 goto onError;
7825 }
7826 prec = prec*10 + (c - '0');
7827 }
7828 }
7829 } /* prec */
7830 if (fmtcnt >= 0) {
7831 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 if (--fmtcnt >= 0)
7833 c = *fmt++;
7834 }
7835 }
7836 if (fmtcnt < 0) {
7837 PyErr_SetString(PyExc_ValueError,
7838 "incomplete format");
7839 goto onError;
7840 }
7841 if (c != '%') {
7842 v = getnextarg(args, arglen, &argidx);
7843 if (v == NULL)
7844 goto onError;
7845 }
7846 sign = 0;
7847 fill = ' ';
7848 switch (c) {
7849
7850 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007851 pbuf = formatbuf;
7852 /* presume that buffer length is at least 1 */
7853 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 len = 1;
7855 break;
7856
7857 case 's':
7858 case 'r':
7859 if (PyUnicode_Check(v) && c == 's') {
7860 temp = v;
7861 Py_INCREF(temp);
7862 }
7863 else {
7864 PyObject *unicode;
7865 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007866 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 else
7868 temp = PyObject_Repr(v);
7869 if (temp == NULL)
7870 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007871 if (PyUnicode_Check(temp))
7872 /* nothing to do */;
7873 else if (PyString_Check(temp)) {
7874 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007875 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007877 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007879 Py_DECREF(temp);
7880 temp = unicode;
7881 if (temp == NULL)
7882 goto onError;
7883 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007884 else {
7885 Py_DECREF(temp);
7886 PyErr_SetString(PyExc_TypeError,
7887 "%s argument has non-string str()");
7888 goto onError;
7889 }
7890 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007891 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892 len = PyUnicode_GET_SIZE(temp);
7893 if (prec >= 0 && len > prec)
7894 len = prec;
7895 break;
7896
7897 case 'i':
7898 case 'd':
7899 case 'u':
7900 case 'o':
7901 case 'x':
7902 case 'X':
7903 if (c == 'i')
7904 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007905 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007906 temp = formatlong(v, flags, prec, c);
7907 if (!temp)
7908 goto onError;
7909 pbuf = PyUnicode_AS_UNICODE(temp);
7910 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007911 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007913 else {
7914 pbuf = formatbuf;
7915 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7916 flags, prec, c, v);
7917 if (len < 0)
7918 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007919 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007920 }
7921 if (flags & F_ZERO)
7922 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 break;
7924
7925 case 'e':
7926 case 'E':
7927 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007928 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 case 'g':
7930 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007931 if (c == 'F')
7932 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007933 pbuf = formatbuf;
7934 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7935 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 if (len < 0)
7937 goto onError;
7938 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007939 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 fill = '0';
7941 break;
7942
7943 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007944 pbuf = formatbuf;
7945 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 if (len < 0)
7947 goto onError;
7948 break;
7949
7950 default:
7951 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007952 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007953 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007954 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007955 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007956 (Py_ssize_t)(fmt - 1 -
7957 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 goto onError;
7959 }
7960 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007961 if (*pbuf == '-' || *pbuf == '+') {
7962 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 len--;
7964 }
7965 else if (flags & F_SIGN)
7966 sign = '+';
7967 else if (flags & F_BLANK)
7968 sign = ' ';
7969 else
7970 sign = 0;
7971 }
7972 if (width < len)
7973 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007974 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 reslen -= rescnt;
7976 rescnt = width + fmtcnt + 100;
7977 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007978 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007979 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007980 PyErr_NoMemory();
7981 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007982 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007983 if (_PyUnicode_Resize(&result, reslen) < 0) {
7984 Py_XDECREF(temp);
7985 goto onError;
7986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 res = PyUnicode_AS_UNICODE(result)
7988 + reslen - rescnt;
7989 }
7990 if (sign) {
7991 if (fill != ' ')
7992 *res++ = sign;
7993 rescnt--;
7994 if (width > len)
7995 width--;
7996 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007997 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7998 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007999 assert(pbuf[1] == c);
8000 if (fill != ' ') {
8001 *res++ = *pbuf++;
8002 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008003 }
Tim Petersfff53252001-04-12 18:38:48 +00008004 rescnt -= 2;
8005 width -= 2;
8006 if (width < 0)
8007 width = 0;
8008 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 if (width > len && !(flags & F_LJUST)) {
8011 do {
8012 --rescnt;
8013 *res++ = fill;
8014 } while (--width > len);
8015 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008016 if (fill == ' ') {
8017 if (sign)
8018 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008019 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008020 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008021 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008022 *res++ = *pbuf++;
8023 *res++ = *pbuf++;
8024 }
8025 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008026 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 res += len;
8028 rescnt -= len;
8029 while (--width >= len) {
8030 --rescnt;
8031 *res++ = ' ';
8032 }
8033 if (dict && (argidx < arglen) && c != '%') {
8034 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008035 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008036 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 goto onError;
8038 }
8039 Py_XDECREF(temp);
8040 } /* '%' */
8041 } /* until end */
8042 if (argidx < arglen && !dict) {
8043 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008044 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 goto onError;
8046 }
8047
Thomas Woutersa96affe2006-03-12 00:29:36 +00008048 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8049 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 if (args_owned) {
8051 Py_DECREF(args);
8052 }
8053 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return (PyObject *)result;
8055
8056 onError:
8057 Py_XDECREF(result);
8058 Py_DECREF(uformat);
8059 if (args_owned) {
8060 Py_DECREF(args);
8061 }
8062 return NULL;
8063}
8064
8065static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008066 (readbufferproc) unicode_buffer_getreadbuf,
8067 (writebufferproc) unicode_buffer_getwritebuf,
8068 (segcountproc) unicode_buffer_getsegcount,
8069 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070};
8071
Jeremy Hylton938ace62002-07-17 16:30:39 +00008072static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008073unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8074
Tim Peters6d6c1a32001-08-02 04:15:00 +00008075static PyObject *
8076unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8077{
8078 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008079 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008080 char *encoding = NULL;
8081 char *errors = NULL;
8082
Guido van Rossume023fe02001-08-30 03:12:59 +00008083 if (type != &PyUnicode_Type)
8084 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008085 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8086 kwlist, &x, &encoding, &errors))
8087 return NULL;
8088 if (x == NULL)
8089 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008090 if (encoding == NULL && errors == NULL)
8091 return PyObject_Unicode(x);
8092 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008093 return PyUnicode_FromEncodedObject(x, encoding, errors);
8094}
8095
Guido van Rossume023fe02001-08-30 03:12:59 +00008096static PyObject *
8097unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8098{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008099 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008101
8102 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8103 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8104 if (tmp == NULL)
8105 return NULL;
8106 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008107 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008108 if (pnew == NULL) {
8109 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008110 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008111 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008112 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8113 if (pnew->str == NULL) {
8114 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008115 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008116 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008117 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008118 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008119 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8120 pnew->length = n;
8121 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008122 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008123 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008124}
8125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008126PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008127"unicode(string [, encoding[, errors]]) -> object\n\
8128\n\
8129Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008130encoding defaults to the current default string encoding.\n\
8131errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008132
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008133static PyObject *unicode_iter(PyObject *seq);
8134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135PyTypeObject PyUnicode_Type = {
8136 PyObject_HEAD_INIT(&PyType_Type)
8137 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008138 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 sizeof(PyUnicodeObject), /* tp_size */
8140 0, /* tp_itemsize */
8141 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008142 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008144 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008146 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008147 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008148 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008150 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 (hashfunc) unicode_hash, /* tp_hash*/
8152 0, /* tp_call*/
8153 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008154 PyObject_GenericGetAttr, /* tp_getattro */
8155 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008157 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8158 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008159 unicode_doc, /* tp_doc */
8160 0, /* tp_traverse */
8161 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008162 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008163 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008164 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008165 0, /* tp_iternext */
8166 unicode_methods, /* tp_methods */
8167 0, /* tp_members */
8168 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008169 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008170 0, /* tp_dict */
8171 0, /* tp_descr_get */
8172 0, /* tp_descr_set */
8173 0, /* tp_dictoffset */
8174 0, /* tp_init */
8175 0, /* tp_alloc */
8176 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008177 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178};
8179
8180/* Initialize the Unicode implementation */
8181
Thomas Wouters78890102000-07-22 19:25:51 +00008182void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008184 int i;
8185
Thomas Wouters477c8d52006-05-27 19:21:47 +00008186 /* XXX - move this array to unicodectype.c ? */
8187 Py_UNICODE linebreak[] = {
8188 0x000A, /* LINE FEED */
8189 0x000D, /* CARRIAGE RETURN */
8190 0x001C, /* FILE SEPARATOR */
8191 0x001D, /* GROUP SEPARATOR */
8192 0x001E, /* RECORD SEPARATOR */
8193 0x0085, /* NEXT LINE */
8194 0x2028, /* LINE SEPARATOR */
8195 0x2029, /* PARAGRAPH SEPARATOR */
8196 };
8197
Fred Drakee4315f52000-05-09 19:53:39 +00008198 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008199 unicode_freelist = NULL;
8200 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008202 if (!unicode_empty)
8203 return;
8204
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008205 for (i = 0; i < 256; i++)
8206 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008207 if (PyType_Ready(&PyUnicode_Type) < 0)
8208 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209
8210 /* initialize the linebreak bloom filter */
8211 bloom_linebreak = make_bloom_mask(
8212 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8213 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008214
8215 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216}
8217
8218/* Finalize the Unicode implementation */
8219
8220void
Thomas Wouters78890102000-07-22 19:25:51 +00008221_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008223 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008224 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008226 Py_XDECREF(unicode_empty);
8227 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008228
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008229 for (i = 0; i < 256; i++) {
8230 if (unicode_latin1[i]) {
8231 Py_DECREF(unicode_latin1[i]);
8232 unicode_latin1[i] = NULL;
8233 }
8234 }
8235
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008236 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 PyUnicodeObject *v = u;
8238 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008239 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008240 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008241 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008242 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008244 unicode_freelist = NULL;
8245 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008247
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008248
8249
8250/********************* Unicode Iterator **************************/
8251
8252typedef struct {
8253 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008254 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008255 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8256} unicodeiterobject;
8257
8258static void
8259unicodeiter_dealloc(unicodeiterobject *it)
8260{
8261 _PyObject_GC_UNTRACK(it);
8262 Py_XDECREF(it->it_seq);
8263 PyObject_GC_Del(it);
8264}
8265
8266static int
8267unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8268{
8269 Py_VISIT(it->it_seq);
8270 return 0;
8271}
8272
8273static PyObject *
8274unicodeiter_next(unicodeiterobject *it)
8275{
8276 PyUnicodeObject *seq;
8277 PyObject *item;
8278
8279 assert(it != NULL);
8280 seq = it->it_seq;
8281 if (seq == NULL)
8282 return NULL;
8283 assert(PyUnicode_Check(seq));
8284
8285 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008286 item = PyUnicode_FromUnicode(
8287 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008288 if (item != NULL)
8289 ++it->it_index;
8290 return item;
8291 }
8292
8293 Py_DECREF(seq);
8294 it->it_seq = NULL;
8295 return NULL;
8296}
8297
8298static PyObject *
8299unicodeiter_len(unicodeiterobject *it)
8300{
8301 Py_ssize_t len = 0;
8302 if (it->it_seq)
8303 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8304 return PyInt_FromSsize_t(len);
8305}
8306
8307PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8308
8309static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008310 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8311 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008312 {NULL, NULL} /* sentinel */
8313};
8314
8315PyTypeObject PyUnicodeIter_Type = {
8316 PyObject_HEAD_INIT(&PyType_Type)
8317 0, /* ob_size */
8318 "unicodeiterator", /* tp_name */
8319 sizeof(unicodeiterobject), /* tp_basicsize */
8320 0, /* tp_itemsize */
8321 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008322 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008323 0, /* tp_print */
8324 0, /* tp_getattr */
8325 0, /* tp_setattr */
8326 0, /* tp_compare */
8327 0, /* tp_repr */
8328 0, /* tp_as_number */
8329 0, /* tp_as_sequence */
8330 0, /* tp_as_mapping */
8331 0, /* tp_hash */
8332 0, /* tp_call */
8333 0, /* tp_str */
8334 PyObject_GenericGetAttr, /* tp_getattro */
8335 0, /* tp_setattro */
8336 0, /* tp_as_buffer */
8337 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8338 0, /* tp_doc */
8339 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8340 0, /* tp_clear */
8341 0, /* tp_richcompare */
8342 0, /* tp_weaklistoffset */
8343 PyObject_SelfIter, /* tp_iter */
8344 (iternextfunc)unicodeiter_next, /* tp_iternext */
8345 unicodeiter_methods, /* tp_methods */
8346 0,
8347};
8348
8349static PyObject *
8350unicode_iter(PyObject *seq)
8351{
8352 unicodeiterobject *it;
8353
8354 if (!PyUnicode_Check(seq)) {
8355 PyErr_BadInternalCall();
8356 return NULL;
8357 }
8358 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8359 if (it == NULL)
8360 return NULL;
8361 it->it_index = 0;
8362 Py_INCREF(seq);
8363 it->it_seq = (PyUnicodeObject *)seq;
8364 _PyObject_GC_TRACK(it);
8365 return (PyObject *)it;
8366}
8367
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008368#ifdef __cplusplus
8369}
8370#endif
8371
8372
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008373/*
8374Local variables:
8375c-basic-offset: 4
8376indent-tabs-mode: nil
8377End:
8378*/