blob: df84eb18e347a96fe1e3c295ccbddb8b4ebd594b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Maximum number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   Objects placed on the free list keep their character buffer when
   their length is below this limit.  This avoids a malloc() call when
   a small Unicode object is recycled.

   In the worst case this retains MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory.

   A limit of 0 effectively turns the optimization off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Simple "bloom filters" for Unicode characters.  To keep things
   simple a single bitmask is used; the least significant 5 bits of
   each Unicode character select the bit index (0..31). */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.

   The shifted constant is an unsigned long so that bit 31 can be
   selected without overflowing a signed int, and mask is parenthesized
   so that a compound expression can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter with bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK() test.  Note that ch is evaluated twice. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True when chr passes the bloom pre-filter for mask and is really a
   member of set[0..setlen).  The whole expansion is parenthesized so
   the macro can be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Convenience wrapper for use inside unicodeobject.c only: casts the
   address of a Unicode object variable to PyObject ** and forwards to
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
430 unicode = _PyUnicode_New(size);
431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937/* --- UTF-7 Codec -------------------------------------------------------- */
938
939/* see RFC2152 for details */
940
static
char utf7_special[128] = {
    /* Classification of each 7-bit character for UTF-7, i.e. whether it
       can be encoded directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */

    /* 0x00 - 0x0f: TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space through '/' */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: '0' through '?' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' through 'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P' through '_' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' through 'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p' through DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
};
959
/* Helper macros for the UTF-7 codec.  All of them evaluate their arguments
   more than once, so only pass side-effect free expressions. */

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c cannot be written directly: outside 1..127, always-special,
   or optional whitespace / Set O when the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Spelled out as ASCII ranges instead of
   isalnum(): c is a Py_UNICODE at the call sites, and isalnum() is
   undefined for values that do not fit in an unsigned char and
   locale-dependent above 127. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as ch holds at least six pending
   bits, most significant bits first; bits is reduced accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Move complete 16-bit units from the bit accumulator ch to out.
   Expects `errmsg` and a `utf7Error` label to exist at the expansion
   site: a unit in the range 0xDC00..0xDFFF sets errmsg and jumps there. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
1235#undef SPECIAL
1236#undef B64
1237#undef B64CHAR
1238#undef UB64
1239#undef ENCODE
1240#undef DECODE
1241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242/* --- UTF-8 Codec -------------------------------------------------------- */
1243
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */

    /* 0x00 - 0x7f: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five, six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
2075/* Return a Unicode-Escape string version of the Unicode object.
2076
2077 If quotes is true, the string is enclosed in u"" or u'' quotes as
2078 appropriate.
2079
2080*/
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Walter Dörwald79e913e2007-05-12 11:08:06 +00002097static const char *hexdigits = "0123456789abcdef";
2098
2099PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Thomas Wouters89f507f2006-12-13 04:49:30 +00002105 /* XXX(nnorwitz): rather than over-allocating, it would be
2106 better to choose a different scheme. Perhaps scan the
2107 first N-chars of the string and allocate based on that size.
2108 */
2109 /* Initial allocation is based on the longest-possible unichr
2110 escape.
2111
2112 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2113 unichr, so in this case it's the longest unichr escape. In
2114 narrow (UTF-16) builds this is five chars per source unichr
2115 since there are two unichrs in the surrogate pair, so in narrow
2116 (UTF-16) builds it's not the longest unichr escape.
2117
2118 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2119 so in the narrow (UTF-16) build case it's the longest unichr
2120 escape.
2121 */
2122
Walter Dörwald79e913e2007-05-12 11:08:06 +00002123 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002124#ifdef Py_UNICODE_WIDE
2125 + 10*size
2126#else
2127 + 6*size
2128#endif
2129 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 if (repr == NULL)
2131 return NULL;
2132
Walter Dörwald79e913e2007-05-12 11:08:06 +00002133 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 while (size-- > 0) {
2136 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Walter Dörwald79e913e2007-05-12 11:08:06 +00002138 /* Escape backslashes */
2139 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 *p++ = '\\';
2141 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002142 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002145#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002146 /* Map 21-bit characters to '\U00xxxxxx' */
2147 else if (ch >= 0x10000) {
2148 *p++ = '\\';
2149 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002150 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2151 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2152 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2153 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2154 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2155 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2156 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2157 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002159 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002160#else
2161 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002162 else if (ch >= 0xD800 && ch < 0xDC00) {
2163 Py_UNICODE ch2;
2164 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002165
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002166 ch2 = *s++;
2167 size--;
2168 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2169 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2170 *p++ = '\\';
2171 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002172 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2173 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2174 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2175 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2176 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2177 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2178 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2179 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002180 continue;
2181 }
2182 /* Fall through: isolated surrogates are copied as-is */
2183 s--;
2184 size++;
2185 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002186#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002189 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 *p++ = '\\';
2191 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002192 *p++ = hexdigits[(ch >> 12) & 0x000F];
2193 *p++ = hexdigits[(ch >> 8) & 0x000F];
2194 *p++ = hexdigits[(ch >> 4) & 0x000F];
2195 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002197
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002198 /* Map special whitespace to '\t', \n', '\r' */
2199 else if (ch == '\t') {
2200 *p++ = '\\';
2201 *p++ = 't';
2202 }
2203 else if (ch == '\n') {
2204 *p++ = '\\';
2205 *p++ = 'n';
2206 }
2207 else if (ch == '\r') {
2208 *p++ = '\\';
2209 *p++ = 'r';
2210 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002211
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002212 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002213 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002215 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002216 *p++ = hexdigits[(ch >> 4) & 0x000F];
2217 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 /* Copy everything else as-is */
2221 else
2222 *p++ = (char) ch;
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002226 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2227 Py_DECREF(repr);
2228 return NULL;
2229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return repr;
2231}
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2234{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002235 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002240 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode));
2242
2243 if (!s)
2244 return NULL;
2245 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2246 PyBytes_GET_SIZE(s));
2247 Py_DECREF(s);
2248 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249}
2250
2251/* --- Raw Unicode Escape Codec ------------------------------------------- */
2252
2253PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002254 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 const char *errors)
2256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002258 Py_ssize_t startinpos;
2259 Py_ssize_t endinpos;
2260 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 const char *end;
2264 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 PyObject *errorHandler = NULL;
2266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002267
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 /* Escaped strings will always be longer than the resulting
2269 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 length after conversion to the true value. (But decoding error
2271 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 end = s + size;
2279 while (s < end) {
2280 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002281 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002283 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 /* Non-escape characters are interpreted as Unicode ordinals */
2286 if (*s != '\\') {
2287 *p++ = (unsigned char)*s++;
2288 continue;
2289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
2292 /* \u-escapes are only interpreted iff the number of leading
2293 backslashes if odd */
2294 bs = s;
2295 for (;s < end;) {
2296 if (*s != '\\')
2297 break;
2298 *p++ = (unsigned char)*s++;
2299 }
2300 if (((s - bs) & 1) == 0 ||
2301 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002302 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 continue;
2304 }
2305 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002306 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 s++;
2308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002314 endinpos = s-starts;
2315 if (unicode_decode_call_errorhandler(
2316 errors, &errorHandler,
2317 "rawunicodeescape", "truncated \\uXXXX",
2318 starts, size, &startinpos, &endinpos, &exc, &s,
2319 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002321 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
2323 x = (x<<4) & ~0xF;
2324 if (c >= '0' && c <= '9')
2325 x += c - '0';
2326 else if (c >= 'a' && c <= 'f')
2327 x += 10 + c - 'a';
2328 else
2329 x += 10 + c - 'A';
2330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002331#ifndef Py_UNICODE_WIDE
2332 if (x > 0x10000) {
2333 if (unicode_decode_call_errorhandler(
2334 errors, &errorHandler,
2335 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2336 starts, size, &startinpos, &endinpos, &exc, &s,
2337 (PyObject **)&v, &outpos, &p))
2338 goto onError;
2339 }
2340#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 *p++ = x;
2342 nextByte:
2343 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002345 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 onError:
2352 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 return NULL;
2356}
2357
2358PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360{
2361 PyObject *repr;
2362 char *p;
2363 char *q;
2364
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002365#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002366 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002367#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002368 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 if (repr == NULL)
2371 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002372 if (size == 0)
2373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
Walter Dörwald711005d2007-05-12 12:03:26 +00002375 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 while (size-- > 0) {
2377 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002378#ifdef Py_UNICODE_WIDE
2379 /* Map 32-bit characters to '\Uxxxxxxxx' */
2380 if (ch >= 0x10000) {
2381 *p++ = '\\';
2382 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002383 *p++ = hexdigits[(ch >> 28) & 0xf];
2384 *p++ = hexdigits[(ch >> 24) & 0xf];
2385 *p++ = hexdigits[(ch >> 20) & 0xf];
2386 *p++ = hexdigits[(ch >> 16) & 0xf];
2387 *p++ = hexdigits[(ch >> 12) & 0xf];
2388 *p++ = hexdigits[(ch >> 8) & 0xf];
2389 *p++ = hexdigits[(ch >> 4) & 0xf];
2390 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002391 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002392 else
2393#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 /* Map 16-bit characters to '\uxxxx' */
2395 if (ch >= 256) {
2396 *p++ = '\\';
2397 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002398 *p++ = hexdigits[(ch >> 12) & 0xf];
2399 *p++ = hexdigits[(ch >> 8) & 0xf];
2400 *p++ = hexdigits[(ch >> 4) & 0xf];
2401 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 }
2403 /* Copy everything else as-is */
2404 else
2405 *p++ = (char) ch;
2406 }
2407 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002408 if (PyBytes_Resize(repr, p - q)) {
2409 Py_DECREF(repr);
2410 return NULL;
2411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return repr;
2413}
2414
2415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2416{
Walter Dörwald711005d2007-05-12 12:03:26 +00002417 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002419 PyErr_BadArgument();
2420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002422 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2423 PyUnicode_GET_SIZE(unicode));
2424
2425 if (!s)
2426 return NULL;
2427 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2428 PyBytes_GET_SIZE(s));
2429 Py_DECREF(s);
2430 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431}
2432
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002433/* --- Unicode Internal Codec ------------------------------------------- */
2434
2435PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002437 const char *errors)
2438{
2439 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t startinpos;
2441 Py_ssize_t endinpos;
2442 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002443 PyUnicodeObject *v;
2444 Py_UNICODE *p;
2445 const char *end;
2446 const char *reason;
2447 PyObject *errorHandler = NULL;
2448 PyObject *exc = NULL;
2449
Neal Norwitzd43069c2006-01-08 01:12:10 +00002450#ifdef Py_UNICODE_WIDE
2451 Py_UNICODE unimax = PyUnicode_GetMax();
2452#endif
2453
Thomas Wouters89f507f2006-12-13 04:49:30 +00002454 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002455 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2456 if (v == NULL)
2457 goto onError;
2458 if (PyUnicode_GetSize((PyObject *)v) == 0)
2459 return (PyObject *)v;
2460 p = PyUnicode_AS_UNICODE(v);
2461 end = s + size;
2462
2463 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002464 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002465 /* We have to sanity check the raw data, otherwise doom looms for
2466 some malformed UCS-4 data. */
2467 if (
2468 #ifdef Py_UNICODE_WIDE
2469 *p > unimax || *p < 0 ||
2470 #endif
2471 end-s < Py_UNICODE_SIZE
2472 )
2473 {
2474 startinpos = s - starts;
2475 if (end-s < Py_UNICODE_SIZE) {
2476 endinpos = end-starts;
2477 reason = "truncated input";
2478 }
2479 else {
2480 endinpos = s - starts + Py_UNICODE_SIZE;
2481 reason = "illegal code point (> 0x10FFFF)";
2482 }
2483 outpos = p - PyUnicode_AS_UNICODE(v);
2484 if (unicode_decode_call_errorhandler(
2485 errors, &errorHandler,
2486 "unicode_internal", reason,
2487 starts, size, &startinpos, &endinpos, &exc, &s,
2488 (PyObject **)&v, &outpos, &p)) {
2489 goto onError;
2490 }
2491 }
2492 else {
2493 p++;
2494 s += Py_UNICODE_SIZE;
2495 }
2496 }
2497
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002498 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002499 goto onError;
2500 Py_XDECREF(errorHandler);
2501 Py_XDECREF(exc);
2502 return (PyObject *)v;
2503
2504 onError:
2505 Py_XDECREF(v);
2506 Py_XDECREF(errorHandler);
2507 Py_XDECREF(exc);
2508 return NULL;
2509}
2510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511/* --- Latin-1 Codec ------------------------------------------------------ */
2512
2513PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 const char *errors)
2516{
2517 PyUnicodeObject *v;
2518 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002519
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002521 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 Py_UNICODE r = *(unsigned char*)s;
2523 return PyUnicode_FromUnicode(&r, 1);
2524 }
2525
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 v = _PyUnicode_New(size);
2527 if (v == NULL)
2528 goto onError;
2529 if (size == 0)
2530 return (PyObject *)v;
2531 p = PyUnicode_AS_UNICODE(v);
2532 while (size-- > 0)
2533 *p++ = (unsigned char)*s++;
2534 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002535
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 onError:
2537 Py_XDECREF(v);
2538 return NULL;
2539}
2540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541/* create or adjust a UnicodeEncodeError */
2542static void make_encode_exception(PyObject **exceptionObject,
2543 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 const Py_UNICODE *unicode, Py_ssize_t size,
2545 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 if (*exceptionObject == NULL) {
2549 *exceptionObject = PyUnicodeEncodeError_Create(
2550 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
2552 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2554 goto onError;
2555 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2556 goto onError;
2557 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2558 goto onError;
2559 return;
2560 onError:
2561 Py_DECREF(*exceptionObject);
2562 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564}
2565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566/* raises a UnicodeEncodeError */
2567static void raise_encode_exception(PyObject **exceptionObject,
2568 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 const Py_UNICODE *unicode, Py_ssize_t size,
2570 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002571 const char *reason)
2572{
2573 make_encode_exception(exceptionObject,
2574 encoding, unicode, size, startpos, endpos, reason);
2575 if (*exceptionObject != NULL)
2576 PyCodec_StrictErrors(*exceptionObject);
2577}
2578
2579/* error handling callback helper:
2580 build arguments, call the callback and check the arguments,
2581 put the result into newpos and return the replacement string, which
2582 has to be freed by the caller */
2583static PyObject *unicode_encode_call_errorhandler(const char *errors,
2584 PyObject **errorHandler,
2585 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2587 Py_ssize_t startpos, Py_ssize_t endpos,
2588 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002590 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002591
2592 PyObject *restuple;
2593 PyObject *resunicode;
2594
2595 if (*errorHandler == NULL) {
2596 *errorHandler = PyCodec_LookupError(errors);
2597 if (*errorHandler == NULL)
2598 return NULL;
2599 }
2600
2601 make_encode_exception(exceptionObject,
2602 encoding, unicode, size, startpos, endpos, reason);
2603 if (*exceptionObject == NULL)
2604 return NULL;
2605
2606 restuple = PyObject_CallFunctionObjArgs(
2607 *errorHandler, *exceptionObject, NULL);
2608 if (restuple == NULL)
2609 return NULL;
2610 if (!PyTuple_Check(restuple)) {
2611 PyErr_Format(PyExc_TypeError, &argparse[4]);
2612 Py_DECREF(restuple);
2613 return NULL;
2614 }
2615 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2616 &resunicode, newpos)) {
2617 Py_DECREF(restuple);
2618 return NULL;
2619 }
2620 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002621 *newpos = size+*newpos;
2622 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002623 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002624 Py_DECREF(restuple);
2625 return NULL;
2626 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_INCREF(resunicode);
2628 Py_DECREF(restuple);
2629 return resunicode;
2630}
2631
2632static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002633 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 const char *errors,
2635 int limit)
2636{
2637 /* output object */
2638 PyObject *res;
2639 /* pointers to the beginning and end+1 of input */
2640 const Py_UNICODE *startp = p;
2641 const Py_UNICODE *endp = p + size;
2642 /* pointer to the beginning of the unencodable characters */
2643 /* const Py_UNICODE *badp = NULL; */
2644 /* pointer into the output */
2645 char *str;
2646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002647 Py_ssize_t respos = 0;
2648 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2650 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 PyObject *errorHandler = NULL;
2652 PyObject *exc = NULL;
2653 /* the following variable is used for caching string comparisons
2654 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2655 int known_errorHandler = -1;
2656
2657 /* allocate enough for a simple encoding without
2658 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002659 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 if (res == NULL)
2661 goto onError;
2662 if (size == 0)
2663 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002664 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 ressize = size;
2666
2667 while (p<endp) {
2668 Py_UNICODE c = *p;
2669
2670 /* can we encode this? */
2671 if (c<limit) {
2672 /* no overflow check, because we know that the space is enough */
2673 *str++ = (char)c;
2674 ++p;
2675 }
2676 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002677 Py_ssize_t unicodepos = p-startp;
2678 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t repsize;
2681 Py_ssize_t newpos;
2682 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 Py_UNICODE *uni2;
2684 /* startpos for collecting unencodable chars */
2685 const Py_UNICODE *collstart = p;
2686 const Py_UNICODE *collend = p;
2687 /* find all unecodable characters */
2688 while ((collend < endp) && ((*collend)>=limit))
2689 ++collend;
2690 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2691 if (known_errorHandler==-1) {
2692 if ((errors==NULL) || (!strcmp(errors, "strict")))
2693 known_errorHandler = 1;
2694 else if (!strcmp(errors, "replace"))
2695 known_errorHandler = 2;
2696 else if (!strcmp(errors, "ignore"))
2697 known_errorHandler = 3;
2698 else if (!strcmp(errors, "xmlcharrefreplace"))
2699 known_errorHandler = 4;
2700 else
2701 known_errorHandler = 0;
2702 }
2703 switch (known_errorHandler) {
2704 case 1: /* strict */
2705 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2706 goto onError;
2707 case 2: /* replace */
2708 while (collstart++<collend)
2709 *str++ = '?'; /* fall through */
2710 case 3: /* ignore */
2711 p = collend;
2712 break;
2713 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002714 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 /* determine replacement size (temporarily (mis)uses p) */
2716 for (p = collstart, repsize = 0; p < collend; ++p) {
2717 if (*p<10)
2718 repsize += 2+1+1;
2719 else if (*p<100)
2720 repsize += 2+2+1;
2721 else if (*p<1000)
2722 repsize += 2+3+1;
2723 else if (*p<10000)
2724 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002725#ifndef Py_UNICODE_WIDE
2726 else
2727 repsize += 2+5+1;
2728#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 else if (*p<100000)
2730 repsize += 2+5+1;
2731 else if (*p<1000000)
2732 repsize += 2+6+1;
2733 else
2734 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002735#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 }
2737 requiredsize = respos+repsize+(endp-collend);
2738 if (requiredsize > ressize) {
2739 if (requiredsize<2*ressize)
2740 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002741 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002743 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 ressize = requiredsize;
2745 }
2746 /* generate replacement (temporarily (mis)uses p) */
2747 for (p = collstart; p < collend; ++p) {
2748 str += sprintf(str, "&#%d;", (int)*p);
2749 }
2750 p = collend;
2751 break;
2752 default:
2753 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2754 encoding, reason, startp, size, &exc,
2755 collstart-startp, collend-startp, &newpos);
2756 if (repunicode == NULL)
2757 goto onError;
2758 /* need more space? (at least enough for what we
2759 have+the replacement+the rest of the string, so
2760 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002761 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 repsize = PyUnicode_GET_SIZE(repunicode);
2763 requiredsize = respos+repsize+(endp-collend);
2764 if (requiredsize > ressize) {
2765 if (requiredsize<2*ressize)
2766 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002767 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 Py_DECREF(repunicode);
2769 goto onError;
2770 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002771 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ressize = requiredsize;
2773 }
2774 /* check if there is anything unencodable in the replacement
2775 and copy it to the output */
2776 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2777 c = *uni2;
2778 if (c >= limit) {
2779 raise_encode_exception(&exc, encoding, startp, size,
2780 unicodepos, unicodepos+1, reason);
2781 Py_DECREF(repunicode);
2782 goto onError;
2783 }
2784 *str = (char)c;
2785 }
2786 p = startp + newpos;
2787 Py_DECREF(repunicode);
2788 }
2789 }
2790 }
2791 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002792 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 if (respos<ressize)
2794 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002795 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
2798 return res;
2799
2800 onError:
2801 Py_XDECREF(res);
2802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
2804 return NULL;
2805}
2806
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 const char *errors)
2810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812}
2813
2814PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2815{
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2819 }
2820 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823}
2824
2825/* --- 7-bit ASCII Codec -------------------------------------------------- */
2826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002828 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 const char *errors)
2830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 PyUnicodeObject *v;
2833 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002834 Py_ssize_t startinpos;
2835 Py_ssize_t endinpos;
2836 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 const char *e;
2838 PyObject *errorHandler = NULL;
2839 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002840
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002842 if (size == 1 && *(unsigned char*)s < 128) {
2843 Py_UNICODE r = *(unsigned char*)s;
2844 return PyUnicode_FromUnicode(&r, 1);
2845 }
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 v = _PyUnicode_New(size);
2848 if (v == NULL)
2849 goto onError;
2850 if (size == 0)
2851 return (PyObject *)v;
2852 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 e = s + size;
2854 while (s < e) {
2855 register unsigned char c = (unsigned char)*s;
2856 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858 ++s;
2859 }
2860 else {
2861 startinpos = s-starts;
2862 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002863 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 if (unicode_decode_call_errorhandler(
2865 errors, &errorHandler,
2866 "ascii", "ordinal not in range(128)",
2867 starts, size, &startinpos, &endinpos, &exc, &s,
2868 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002872 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002873 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002874 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875 Py_XDECREF(errorHandler);
2876 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002878
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 onError:
2880 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 Py_XDECREF(errorHandler);
2882 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 return NULL;
2884}
2885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002887 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 const char *errors)
2889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891}
2892
2893PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2894{
2895 if (!PyUnicode_Check(unicode)) {
2896 PyErr_BadArgument();
2897 return NULL;
2898 }
2899 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2900 PyUnicode_GET_SIZE(unicode),
2901 NULL);
2902}
2903
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002904#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002905
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002906/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002907
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002908#if SIZEOF_INT < SIZEOF_SSIZE_T
2909#define NEED_RETRY
2910#endif
2911
2912/* XXX This code is limited to "true" double-byte encodings, as
2913 a) it assumes an incomplete character consists of a single byte, and
2914 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2915 encodings, see IsDBCSLeadByteEx documentation. */
2916
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when the
   character before it (per CharPrev) is the same position, is not itself
   a lead byte, or starts exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
2927
2928/*
2929 * Decode MBCS string into unicode object. If 'final' is set, converts
2930 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2931 */
2932static int decode_mbcs(PyUnicodeObject **v,
2933 const char *s, /* MBCS string */
2934 int size, /* sizeof MBCS string */
2935 int final)
2936{
2937 Py_UNICODE *p;
2938 Py_ssize_t n = 0;
2939 int usize = 0;
2940
2941 assert(size >= 0);
2942
2943 /* Skip trailing lead-byte unless 'final' is set */
2944 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2945 --size;
2946
2947 /* First get the size of the result */
2948 if (size > 0) {
2949 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2950 if (usize == 0) {
2951 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2952 return -1;
2953 }
2954 }
2955
2956 if (*v == NULL) {
2957 /* Create unicode object */
2958 *v = _PyUnicode_New(usize);
2959 if (*v == NULL)
2960 return -1;
2961 }
2962 else {
2963 /* Extend unicode object */
2964 n = PyUnicode_GET_SIZE(*v);
2965 if (_PyUnicode_Resize(v, n + usize) < 0)
2966 return -1;
2967 }
2968
2969 /* Do the conversion */
2970 if (size > 0) {
2971 p = PyUnicode_AS_UNICODE(*v) + n;
2972 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2974 return -1;
2975 }
2976 }
2977
2978 return size;
2979}
2980
2981PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2982 Py_ssize_t size,
2983 const char *errors,
2984 Py_ssize_t *consumed)
2985{
2986 PyUnicodeObject *v = NULL;
2987 int done;
2988
2989 if (consumed)
2990 *consumed = 0;
2991
2992#ifdef NEED_RETRY
2993 retry:
2994 if (size > INT_MAX)
2995 done = decode_mbcs(&v, s, INT_MAX, 0);
2996 else
2997#endif
2998 done = decode_mbcs(&v, s, (int)size, !consumed);
2999
3000 if (done < 0) {
3001 Py_XDECREF(v);
3002 return NULL;
3003 }
3004
3005 if (consumed)
3006 *consumed += done;
3007
3008#ifdef NEED_RETRY
3009 if (size > INT_MAX) {
3010 s += done;
3011 size -= done;
3012 goto retry;
3013 }
3014#endif
3015
3016 return (PyObject *)v;
3017}
3018
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003020 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003021 const char *errors)
3022{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003023 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3024}
3025
3026/*
3027 * Convert unicode into string object (MBCS).
3028 * Returns 0 if succeed, -1 otherwise.
3029 */
3030static int encode_mbcs(PyObject **repr,
3031 const Py_UNICODE *p, /* unicode */
3032 int size) /* size of unicode */
3033{
3034 int mbcssize = 0;
3035 Py_ssize_t n = 0;
3036
3037 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003038
3039 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003040 if (size > 0) {
3041 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3042 if (mbcssize == 0) {
3043 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3044 return -1;
3045 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046 }
3047
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003048 if (*repr == NULL) {
3049 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003050 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003051 if (*repr == NULL)
3052 return -1;
3053 }
3054 else {
3055 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003056 n = PyBytes_Size(*repr);
3057 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003058 return -1;
3059 }
3060
3061 /* Do the conversion */
3062 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003063 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003064 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3065 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3066 return -1;
3067 }
3068 }
3069
3070 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003071}
3072
3073PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003074 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003075 const char *errors)
3076{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003077 PyObject *repr = NULL;
3078 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003079
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003080#ifdef NEED_RETRY
3081 retry:
3082 if (size > INT_MAX)
3083 ret = encode_mbcs(&repr, p, INT_MAX);
3084 else
3085#endif
3086 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003088 if (ret < 0) {
3089 Py_XDECREF(repr);
3090 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003091 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003092
3093#ifdef NEED_RETRY
3094 if (size > INT_MAX) {
3095 p += INT_MAX;
3096 size -= INT_MAX;
3097 goto retry;
3098 }
3099#endif
3100
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003101 return repr;
3102}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003103
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003104PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3105{
3106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
3108 return NULL;
3109 }
3110 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3111 PyUnicode_GET_SIZE(unicode),
3112 NULL);
3113}
3114
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003115#undef NEED_RETRY
3116
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003117#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119/* --- Character Mapping Codec -------------------------------------------- */
3120
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 PyObject *mapping,
3124 const char *errors)
3125{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003127 Py_ssize_t startinpos;
3128 Py_ssize_t endinpos;
3129 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 PyUnicodeObject *v;
3132 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003136 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003137 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 /* Default to Latin-1 */
3140 if (mapping == NULL)
3141 return PyUnicode_DecodeLatin1(s, size, errors);
3142
3143 v = _PyUnicode_New(size);
3144 if (v == NULL)
3145 goto onError;
3146 if (size == 0)
3147 return (PyObject *)v;
3148 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003150 if (PyUnicode_CheckExact(mapping)) {
3151 mapstring = PyUnicode_AS_UNICODE(mapping);
3152 maplen = PyUnicode_GET_SIZE(mapping);
3153 while (s < e) {
3154 unsigned char ch = *s;
3155 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003157 if (ch < maplen)
3158 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003160 if (x == 0xfffe) {
3161 /* undefined mapping */
3162 outpos = p-PyUnicode_AS_UNICODE(v);
3163 startinpos = s-starts;
3164 endinpos = startinpos+1;
3165 if (unicode_decode_call_errorhandler(
3166 errors, &errorHandler,
3167 "charmap", "character maps to <undefined>",
3168 starts, size, &startinpos, &endinpos, &exc, &s,
3169 (PyObject **)&v, &outpos, &p)) {
3170 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003171 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003172 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003173 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003174 *p++ = x;
3175 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003177 }
3178 else {
3179 while (s < e) {
3180 unsigned char ch = *s;
3181 PyObject *w, *x;
3182
3183 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3184 w = PyInt_FromLong((long)ch);
3185 if (w == NULL)
3186 goto onError;
3187 x = PyObject_GetItem(mapping, w);
3188 Py_DECREF(w);
3189 if (x == NULL) {
3190 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3191 /* No mapping found means: mapping is undefined. */
3192 PyErr_Clear();
3193 x = Py_None;
3194 Py_INCREF(x);
3195 } else
3196 goto onError;
3197 }
3198
3199 /* Apply mapping */
3200 if (PyInt_Check(x)) {
3201 long value = PyInt_AS_LONG(x);
3202 if (value < 0 || value > 65535) {
3203 PyErr_SetString(PyExc_TypeError,
3204 "character mapping must be in range(65536)");
3205 Py_DECREF(x);
3206 goto onError;
3207 }
3208 *p++ = (Py_UNICODE)value;
3209 }
3210 else if (x == Py_None) {
3211 /* undefined mapping */
3212 outpos = p-PyUnicode_AS_UNICODE(v);
3213 startinpos = s-starts;
3214 endinpos = startinpos+1;
3215 if (unicode_decode_call_errorhandler(
3216 errors, &errorHandler,
3217 "charmap", "character maps to <undefined>",
3218 starts, size, &startinpos, &endinpos, &exc, &s,
3219 (PyObject **)&v, &outpos, &p)) {
3220 Py_DECREF(x);
3221 goto onError;
3222 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003223 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003224 continue;
3225 }
3226 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003227 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003228
3229 if (targetsize == 1)
3230 /* 1-1 mapping */
3231 *p++ = *PyUnicode_AS_UNICODE(x);
3232
3233 else if (targetsize > 1) {
3234 /* 1-n mapping */
3235 if (targetsize > extrachars) {
3236 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3238 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003239 (targetsize << 2);
3240 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003241 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003242 if (_PyUnicode_Resize(&v,
3243 PyUnicode_GET_SIZE(v) + needed) < 0) {
3244 Py_DECREF(x);
3245 goto onError;
3246 }
3247 p = PyUnicode_AS_UNICODE(v) + oldpos;
3248 }
3249 Py_UNICODE_COPY(p,
3250 PyUnicode_AS_UNICODE(x),
3251 targetsize);
3252 p += targetsize;
3253 extrachars -= targetsize;
3254 }
3255 /* 1-0 mapping: skip the character */
3256 }
3257 else {
3258 /* wrong return value */
3259 PyErr_SetString(PyExc_TypeError,
3260 "character mapping must return integer, None or unicode");
3261 Py_DECREF(x);
3262 goto onError;
3263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003265 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
3268 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003269 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003274
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 Py_XDECREF(errorHandler);
3277 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 Py_XDECREF(v);
3279 return NULL;
3280}
3281
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003282/* Charmap encoding: the lookup table */
3283
3284struct encoding_map{
3285 PyObject_HEAD
3286 unsigned char level1[32];
3287 int count2, count3;
3288 unsigned char level23[1];
3289};
3290
3291static PyObject*
3292encoding_map_size(PyObject *obj, PyObject* args)
3293{
3294 struct encoding_map *map = (struct encoding_map*)obj;
3295 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3296 128*map->count3);
3297}
3298
3299static PyMethodDef encoding_map_methods[] = {
3300 {"size", encoding_map_size, METH_NOARGS,
3301 PyDoc_STR("Return the size (in bytes) of this object") },
3302 { 0 }
3303};
3304
3305static void
3306encoding_map_dealloc(PyObject* o)
3307{
3308 PyObject_FREE(o);
3309}
3310
3311static PyTypeObject EncodingMapType = {
3312 PyObject_HEAD_INIT(NULL)
3313 0, /*ob_size*/
3314 "EncodingMap", /*tp_name*/
3315 sizeof(struct encoding_map), /*tp_basicsize*/
3316 0, /*tp_itemsize*/
3317 /* methods */
3318 encoding_map_dealloc, /*tp_dealloc*/
3319 0, /*tp_print*/
3320 0, /*tp_getattr*/
3321 0, /*tp_setattr*/
3322 0, /*tp_compare*/
3323 0, /*tp_repr*/
3324 0, /*tp_as_number*/
3325 0, /*tp_as_sequence*/
3326 0, /*tp_as_mapping*/
3327 0, /*tp_hash*/
3328 0, /*tp_call*/
3329 0, /*tp_str*/
3330 0, /*tp_getattro*/
3331 0, /*tp_setattro*/
3332 0, /*tp_as_buffer*/
3333 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3334 0, /*tp_doc*/
3335 0, /*tp_traverse*/
3336 0, /*tp_clear*/
3337 0, /*tp_richcompare*/
3338 0, /*tp_weaklistoffset*/
3339 0, /*tp_iter*/
3340 0, /*tp_iternext*/
3341 encoding_map_methods, /*tp_methods*/
3342 0, /*tp_members*/
3343 0, /*tp_getset*/
3344 0, /*tp_base*/
3345 0, /*tp_dict*/
3346 0, /*tp_descr_get*/
3347 0, /*tp_descr_set*/
3348 0, /*tp_dictoffset*/
3349 0, /*tp_init*/
3350 0, /*tp_alloc*/
3351 0, /*tp_new*/
3352 0, /*tp_free*/
3353 0, /*tp_is_gc*/
3354};
3355
3356PyObject*
3357PyUnicode_BuildEncodingMap(PyObject* string)
3358{
3359 Py_UNICODE *decode;
3360 PyObject *result;
3361 struct encoding_map *mresult;
3362 int i;
3363 int need_dict = 0;
3364 unsigned char level1[32];
3365 unsigned char level2[512];
3366 unsigned char *mlevel1, *mlevel2, *mlevel3;
3367 int count2 = 0, count3 = 0;
3368
3369 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3370 PyErr_BadArgument();
3371 return NULL;
3372 }
3373 decode = PyUnicode_AS_UNICODE(string);
3374 memset(level1, 0xFF, sizeof level1);
3375 memset(level2, 0xFF, sizeof level2);
3376
3377 /* If there isn't a one-to-one mapping of NULL to \0,
3378 or if there are non-BMP characters, we need to use
3379 a mapping dictionary. */
3380 if (decode[0] != 0)
3381 need_dict = 1;
3382 for (i = 1; i < 256; i++) {
3383 int l1, l2;
3384 if (decode[i] == 0
3385 #ifdef Py_UNICODE_WIDE
3386 || decode[i] > 0xFFFF
3387 #endif
3388 ) {
3389 need_dict = 1;
3390 break;
3391 }
3392 if (decode[i] == 0xFFFE)
3393 /* unmapped character */
3394 continue;
3395 l1 = decode[i] >> 11;
3396 l2 = decode[i] >> 7;
3397 if (level1[l1] == 0xFF)
3398 level1[l1] = count2++;
3399 if (level2[l2] == 0xFF)
3400 level2[l2] = count3++;
3401 }
3402
3403 if (count2 >= 0xFF || count3 >= 0xFF)
3404 need_dict = 1;
3405
3406 if (need_dict) {
3407 PyObject *result = PyDict_New();
3408 PyObject *key, *value;
3409 if (!result)
3410 return NULL;
3411 for (i = 0; i < 256; i++) {
3412 key = value = NULL;
3413 key = PyInt_FromLong(decode[i]);
3414 value = PyInt_FromLong(i);
3415 if (!key || !value)
3416 goto failed1;
3417 if (PyDict_SetItem(result, key, value) == -1)
3418 goto failed1;
3419 Py_DECREF(key);
3420 Py_DECREF(value);
3421 }
3422 return result;
3423 failed1:
3424 Py_XDECREF(key);
3425 Py_XDECREF(value);
3426 Py_DECREF(result);
3427 return NULL;
3428 }
3429
3430 /* Create a three-level trie */
3431 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3432 16*count2 + 128*count3 - 1);
3433 if (!result)
3434 return PyErr_NoMemory();
3435 PyObject_Init(result, &EncodingMapType);
3436 mresult = (struct encoding_map*)result;
3437 mresult->count2 = count2;
3438 mresult->count3 = count3;
3439 mlevel1 = mresult->level1;
3440 mlevel2 = mresult->level23;
3441 mlevel3 = mresult->level23 + 16*count2;
3442 memcpy(mlevel1, level1, 32);
3443 memset(mlevel2, 0xFF, 16*count2);
3444 memset(mlevel3, 0, 128*count3);
3445 count3 = 0;
3446 for (i = 1; i < 256; i++) {
3447 int o1, o2, o3, i2, i3;
3448 if (decode[i] == 0xFFFE)
3449 /* unmapped character */
3450 continue;
3451 o1 = decode[i]>>11;
3452 o2 = (decode[i]>>7) & 0xF;
3453 i2 = 16*mlevel1[o1] + o2;
3454 if (mlevel2[i2] == 0xFF)
3455 mlevel2[i2] = count3++;
3456 o3 = decode[i] & 0x7F;
3457 i3 = 128*mlevel2[i2] + o3;
3458 mlevel3[i3] = i;
3459 }
3460 return result;
3461}
3462
3463static int
3464encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3465{
3466 struct encoding_map *map = (struct encoding_map*)mapping;
3467 int l1 = c>>11;
3468 int l2 = (c>>7) & 0xF;
3469 int l3 = c & 0x7F;
3470 int i;
3471
3472#ifdef Py_UNICODE_WIDE
3473 if (c > 0xFFFF) {
3474 return -1;
3475 }
3476#endif
3477 if (c == 0)
3478 return 0;
3479 /* level 1*/
3480 i = map->level1[l1];
3481 if (i == 0xFF) {
3482 return -1;
3483 }
3484 /* level 2*/
3485 i = map->level23[16*i+l2];
3486 if (i == 0xFF) {
3487 return -1;
3488 }
3489 /* level 3 */
3490 i = map->level23[16*map->count2 + 128*i + l3];
3491 if (i == 0) {
3492 return -1;
3493 }
3494 return i;
3495}
3496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497/* Lookup the character ch in the mapping. If the character
3498 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003499 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *w = PyInt_FromLong((long)c);
3503 PyObject *x;
3504
3505 if (w == NULL)
3506 return NULL;
3507 x = PyObject_GetItem(mapping, w);
3508 Py_DECREF(w);
3509 if (x == NULL) {
3510 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3511 /* No mapping found means: mapping is undefined. */
3512 PyErr_Clear();
3513 x = Py_None;
3514 Py_INCREF(x);
3515 return x;
3516 } else
3517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003519 else if (x == Py_None)
3520 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 else if (PyInt_Check(x)) {
3522 long value = PyInt_AS_LONG(x);
3523 if (value < 0 || value > 255) {
3524 PyErr_SetString(PyExc_TypeError,
3525 "character mapping must be in range(256)");
3526 Py_DECREF(x);
3527 return NULL;
3528 }
3529 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 else if (PyString_Check(x))
3532 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003535 PyErr_Format(PyExc_TypeError,
3536 "character mapping must return integer, None or str8, not %.400s",
3537 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_DECREF(x);
3539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 }
3541}
3542
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003543static int
3544charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3545{
3546 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3547 /* exponentially overallocate to minimize reallocations */
3548 if (requiredsize < 2*outsize)
3549 requiredsize = 2*outsize;
3550 if (_PyString_Resize(outobj, requiredsize)) {
3551 return 0;
3552 }
3553 return 1;
3554}
3555
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559/* lookup the character, put the result in the output string and adjust
3560 various state variables. Reallocate the output string if not enough
3561 space is available. Return a new reference to the object that
3562 was put in the output buffer, or Py_None, if the mapping was undefined
3563 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003564 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003566charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003569 PyObject *rep;
3570 char *outstart;
3571 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003573 if (mapping->ob_type == &EncodingMapType) {
3574 int res = encoding_map_lookup(c, mapping);
3575 Py_ssize_t requiredsize = *outpos+1;
3576 if (res == -1)
3577 return enc_FAILED;
3578 if (outsize<requiredsize)
3579 if (!charmapencode_resize(outobj, outpos, requiredsize))
3580 return enc_EXCEPTION;
3581 outstart = PyString_AS_STRING(*outobj);
3582 outstart[(*outpos)++] = (char)res;
3583 return enc_SUCCESS;
3584 }
3585
3586 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 return enc_EXCEPTION;
3589 else if (rep==Py_None) {
3590 Py_DECREF(rep);
3591 return enc_FAILED;
3592 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003594 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003595 if (outsize<requiredsize)
3596 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003598 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3602 }
3603 else {
3604 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003605 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3606 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003607 if (outsize<requiredsize)
3608 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003610 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003612 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 memcpy(outstart + *outpos, repchars, repsize);
3614 *outpos += repsize;
3615 }
3616 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003617 Py_DECREF(rep);
3618 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619}
3620
3621/* handle an error in PyUnicode_EncodeCharmap
3622 Return 0 on success, -1 on error */
3623static
3624int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003625 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003627 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003628 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629{
3630 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003631 Py_ssize_t repsize;
3632 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_UNICODE *uni2;
3634 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003635 Py_ssize_t collstartpos = *inpos;
3636 Py_ssize_t collendpos = *inpos+1;
3637 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 char *encoding = "charmap";
3639 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003640 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 /* find all unencodable characters */
3643 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003644 PyObject *rep;
3645 if (mapping->ob_type == &EncodingMapType) {
3646 int res = encoding_map_lookup(p[collendpos], mapping);
3647 if (res != -1)
3648 break;
3649 ++collendpos;
3650 continue;
3651 }
3652
3653 rep = charmapencode_lookup(p[collendpos], mapping);
3654 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003656 else if (rep!=Py_None) {
3657 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 break;
3659 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003660 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 ++collendpos;
3662 }
3663 /* cache callback name lookup
3664 * (if not done yet, i.e. it's the first error) */
3665 if (*known_errorHandler==-1) {
3666 if ((errors==NULL) || (!strcmp(errors, "strict")))
3667 *known_errorHandler = 1;
3668 else if (!strcmp(errors, "replace"))
3669 *known_errorHandler = 2;
3670 else if (!strcmp(errors, "ignore"))
3671 *known_errorHandler = 3;
3672 else if (!strcmp(errors, "xmlcharrefreplace"))
3673 *known_errorHandler = 4;
3674 else
3675 *known_errorHandler = 0;
3676 }
3677 switch (*known_errorHandler) {
3678 case 1: /* strict */
3679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3680 return -1;
3681 case 2: /* replace */
3682 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3683 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003684 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 return -1;
3686 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003687 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3689 return -1;
3690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 }
3692 /* fall through */
3693 case 3: /* ignore */
3694 *inpos = collendpos;
3695 break;
3696 case 4: /* xmlcharrefreplace */
3697 /* generate replacement (temporarily (mis)uses p) */
3698 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3699 char buffer[2+29+1+1];
3700 char *cp;
3701 sprintf(buffer, "&#%d;", (int)p[collpos]);
3702 for (cp = buffer; *cp; ++cp) {
3703 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003704 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003706 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3708 return -1;
3709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 }
3711 }
3712 *inpos = collendpos;
3713 break;
3714 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003715 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 encoding, reason, p, size, exceptionObject,
3717 collstartpos, collendpos, &newpos);
3718 if (repunicode == NULL)
3719 return -1;
3720 /* generate replacement */
3721 repsize = PyUnicode_GET_SIZE(repunicode);
3722 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3723 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003724 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 return -1;
3726 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003727 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3730 return -1;
3731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 }
3733 *inpos = newpos;
3734 Py_DECREF(repunicode);
3735 }
3736 return 0;
3737}
3738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003740 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 PyObject *mapping,
3742 const char *errors)
3743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 /* output object */
3745 PyObject *res = NULL;
3746 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003747 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 PyObject *errorHandler = NULL;
3751 PyObject *exc = NULL;
3752 /* the following variable is used for caching string comparisons
3753 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3754 * 3=ignore, 4=xmlcharrefreplace */
3755 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756
3757 /* Default to Latin-1 */
3758 if (mapping == NULL)
3759 return PyUnicode_EncodeLatin1(p, size, errors);
3760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 /* allocate enough for a simple encoding without
3762 replacements, if we need more, we'll resize */
3763 res = PyString_FromStringAndSize(NULL, size);
3764 if (res == NULL)
3765 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003766 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 while (inpos<size) {
3770 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003771 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3772 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003774 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 if (charmap_encoding_error(p, size, &inpos, mapping,
3776 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003777 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003778 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003779 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 else
3783 /* done with this character => adjust input position */
3784 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 /* Resize if we allocated to much */
3788 if (respos<PyString_GET_SIZE(res)) {
3789 if (_PyString_Resize(&res, respos))
3790 goto onError;
3791 }
3792 Py_XDECREF(exc);
3793 Py_XDECREF(errorHandler);
3794 return res;
3795
3796 onError:
3797 Py_XDECREF(res);
3798 Py_XDECREF(exc);
3799 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 return NULL;
3801}
3802
3803PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3804 PyObject *mapping)
3805{
3806 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
3810 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3811 PyUnicode_GET_SIZE(unicode),
3812 mapping,
3813 NULL);
3814}
3815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816/* create or adjust a UnicodeTranslateError */
3817static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003818 const Py_UNICODE *unicode, Py_ssize_t size,
3819 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 if (*exceptionObject == NULL) {
3823 *exceptionObject = PyUnicodeTranslateError_Create(
3824 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
3826 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3828 goto onError;
3829 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3830 goto onError;
3831 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3832 goto onError;
3833 return;
3834 onError:
3835 Py_DECREF(*exceptionObject);
3836 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 }
3838}
3839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840/* raises a UnicodeTranslateError */
3841static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003842 const Py_UNICODE *unicode, Py_ssize_t size,
3843 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 const char *reason)
3845{
3846 make_translate_exception(exceptionObject,
3847 unicode, size, startpos, endpos, reason);
3848 if (*exceptionObject != NULL)
3849 PyCodec_StrictErrors(*exceptionObject);
3850}
3851
3852/* error handling callback helper:
3853 build arguments, call the callback and check the arguments,
3854 put the result into newpos and return the replacement string, which
3855 has to be freed by the caller */
3856static PyObject *unicode_translate_call_errorhandler(const char *errors,
3857 PyObject **errorHandler,
3858 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003859 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3860 Py_ssize_t startpos, Py_ssize_t endpos,
3861 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003863 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003865 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 PyObject *restuple;
3867 PyObject *resunicode;
3868
3869 if (*errorHandler == NULL) {
3870 *errorHandler = PyCodec_LookupError(errors);
3871 if (*errorHandler == NULL)
3872 return NULL;
3873 }
3874
3875 make_translate_exception(exceptionObject,
3876 unicode, size, startpos, endpos, reason);
3877 if (*exceptionObject == NULL)
3878 return NULL;
3879
3880 restuple = PyObject_CallFunctionObjArgs(
3881 *errorHandler, *exceptionObject, NULL);
3882 if (restuple == NULL)
3883 return NULL;
3884 if (!PyTuple_Check(restuple)) {
3885 PyErr_Format(PyExc_TypeError, &argparse[4]);
3886 Py_DECREF(restuple);
3887 return NULL;
3888 }
3889 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003891 Py_DECREF(restuple);
3892 return NULL;
3893 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003894 if (i_newpos<0)
3895 *newpos = size+i_newpos;
3896 else
3897 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003898 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003899 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003900 Py_DECREF(restuple);
3901 return NULL;
3902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 Py_INCREF(resunicode);
3904 Py_DECREF(restuple);
3905 return resunicode;
3906}
3907
3908/* Lookup the character ch in the mapping and put the result in result,
3909 which must be decrefed by the caller.
3910 Return 0 on success, -1 on error */
3911static
3912int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3913{
3914 PyObject *w = PyInt_FromLong((long)c);
3915 PyObject *x;
3916
3917 if (w == NULL)
3918 return -1;
3919 x = PyObject_GetItem(mapping, w);
3920 Py_DECREF(w);
3921 if (x == NULL) {
3922 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3923 /* No mapping found means: use 1:1 mapping. */
3924 PyErr_Clear();
3925 *result = NULL;
3926 return 0;
3927 } else
3928 return -1;
3929 }
3930 else if (x == Py_None) {
3931 *result = x;
3932 return 0;
3933 }
3934 else if (PyInt_Check(x)) {
3935 long value = PyInt_AS_LONG(x);
3936 long max = PyUnicode_GetMax();
3937 if (value < 0 || value > max) {
3938 PyErr_Format(PyExc_TypeError,
3939 "character mapping must be in range(0x%lx)", max+1);
3940 Py_DECREF(x);
3941 return -1;
3942 }
3943 *result = x;
3944 return 0;
3945 }
3946 else if (PyUnicode_Check(x)) {
3947 *result = x;
3948 return 0;
3949 }
3950 else {
3951 /* wrong return value */
3952 PyErr_SetString(PyExc_TypeError,
3953 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003954 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 return -1;
3956 }
3957}
3958/* ensure that *outobj is at least requiredsize characters long,
3959if not reallocate and adjust various state variables.
3960Return 0 on success, -1 on error */
3961static
Walter Dörwald4894c302003-10-24 14:25:28 +00003962int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003965 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003966 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003970 if (requiredsize < 2 * oldsize)
3971 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003972 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 return -1;
3974 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 }
3976 return 0;
3977}
3978/* lookup the character, put the result in the output string and adjust
3979 various state variables. Return a new reference to the object that
3980 was put in the output buffer in *result, or Py_None, if the mapping was
3981 undefined (in which case no character was written).
3982 The called must decref result.
3983 Return 0 on success, -1 on error. */
3984static
Walter Dörwald4894c302003-10-24 14:25:28 +00003985int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003986 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003987 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988{
Walter Dörwald4894c302003-10-24 14:25:28 +00003989 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 return -1;
3991 if (*res==NULL) {
3992 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003993 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 }
3995 else if (*res==Py_None)
3996 ;
3997 else if (PyInt_Check(*res)) {
3998 /* no overflow check, because we know that the space is enough */
3999 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4000 }
4001 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004002 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (repsize==1) {
4004 /* no overflow check, because we know that the space is enough */
4005 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4006 }
4007 else if (repsize!=0) {
4008 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004010 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004011 repsize - 1;
4012 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 return -1;
4014 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4015 *outp += repsize;
4016 }
4017 }
4018 else
4019 return -1;
4020 return 0;
4021}
4022
4023PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 PyObject *mapping,
4026 const char *errors)
4027{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 /* output object */
4029 PyObject *res = NULL;
4030 /* pointers to the beginning and end+1 of input */
4031 const Py_UNICODE *startp = p;
4032 const Py_UNICODE *endp = p + size;
4033 /* pointer into the output */
4034 Py_UNICODE *str;
4035 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004036 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 char *reason = "character maps to <undefined>";
4038 PyObject *errorHandler = NULL;
4039 PyObject *exc = NULL;
4040 /* the following variable is used for caching string comparisons
4041 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4042 * 3=ignore, 4=xmlcharrefreplace */
4043 int known_errorHandler = -1;
4044
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 if (mapping == NULL) {
4046 PyErr_BadArgument();
4047 return NULL;
4048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049
4050 /* allocate enough for a simple 1:1 translation without
4051 replacements, if we need more, we'll resize */
4052 res = PyUnicode_FromUnicode(NULL, size);
4053 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 return res;
4057 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 while (p<endp) {
4060 /* try to encode it */
4061 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004062 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 goto onError;
4065 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004066 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 if (x!=Py_None) /* it worked => adjust input pointer */
4068 ++p;
4069 else { /* untranslatable character */
4070 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t repsize;
4072 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 Py_UNICODE *uni2;
4074 /* startpos for collecting untranslatable chars */
4075 const Py_UNICODE *collstart = p;
4076 const Py_UNICODE *collend = p+1;
4077 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 /* find all untranslatable characters */
4080 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004081 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 goto onError;
4083 Py_XDECREF(x);
4084 if (x!=Py_None)
4085 break;
4086 ++collend;
4087 }
4088 /* cache callback name lookup
4089 * (if not done yet, i.e. it's the first error) */
4090 if (known_errorHandler==-1) {
4091 if ((errors==NULL) || (!strcmp(errors, "strict")))
4092 known_errorHandler = 1;
4093 else if (!strcmp(errors, "replace"))
4094 known_errorHandler = 2;
4095 else if (!strcmp(errors, "ignore"))
4096 known_errorHandler = 3;
4097 else if (!strcmp(errors, "xmlcharrefreplace"))
4098 known_errorHandler = 4;
4099 else
4100 known_errorHandler = 0;
4101 }
4102 switch (known_errorHandler) {
4103 case 1: /* strict */
4104 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4105 goto onError;
4106 case 2: /* replace */
4107 /* No need to check for space, this is a 1:1 replacement */
4108 for (coll = collstart; coll<collend; ++coll)
4109 *str++ = '?';
4110 /* fall through */
4111 case 3: /* ignore */
4112 p = collend;
4113 break;
4114 case 4: /* xmlcharrefreplace */
4115 /* generate replacement (temporarily (mis)uses p) */
4116 for (p = collstart; p < collend; ++p) {
4117 char buffer[2+29+1+1];
4118 char *cp;
4119 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004120 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4122 goto onError;
4123 for (cp = buffer; *cp; ++cp)
4124 *str++ = *cp;
4125 }
4126 p = collend;
4127 break;
4128 default:
4129 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4130 reason, startp, size, &exc,
4131 collstart-startp, collend-startp, &newpos);
4132 if (repunicode == NULL)
4133 goto onError;
4134 /* generate replacement */
4135 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004136 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4138 Py_DECREF(repunicode);
4139 goto onError;
4140 }
4141 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4142 *str++ = *uni2;
4143 p = startp + newpos;
4144 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 }
4146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 /* Resize if we allocated to much */
4149 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004150 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004151 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004152 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 }
4154 Py_XDECREF(exc);
4155 Py_XDECREF(errorHandler);
4156 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 onError:
4159 Py_XDECREF(res);
4160 Py_XDECREF(exc);
4161 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 return NULL;
4163}
4164
4165PyObject *PyUnicode_Translate(PyObject *str,
4166 PyObject *mapping,
4167 const char *errors)
4168{
4169 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 str = PyUnicode_FromObject(str);
4172 if (str == NULL)
4173 goto onError;
4174 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4175 PyUnicode_GET_SIZE(str),
4176 mapping,
4177 errors);
4178 Py_DECREF(str);
4179 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 onError:
4182 Py_XDECREF(str);
4183 return NULL;
4184}
Tim Petersced69f82003-09-16 20:30:58 +00004185
Guido van Rossum9e896b32000-04-05 20:11:21 +00004186/* --- Decimal Encoder ---------------------------------------------------- */
4187
4188int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004189 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004190 char *output,
4191 const char *errors)
4192{
4193 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 PyObject *errorHandler = NULL;
4195 PyObject *exc = NULL;
4196 const char *encoding = "decimal";
4197 const char *reason = "invalid decimal Unicode string";
4198 /* the following variable is used for caching string comparisons
4199 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4200 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004201
4202 if (output == NULL) {
4203 PyErr_BadArgument();
4204 return -1;
4205 }
4206
4207 p = s;
4208 end = s + length;
4209 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004211 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004213 Py_ssize_t repsize;
4214 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 Py_UNICODE *uni2;
4216 Py_UNICODE *collstart;
4217 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004218
Guido van Rossum9e896b32000-04-05 20:11:21 +00004219 if (Py_UNICODE_ISSPACE(ch)) {
4220 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004222 continue;
4223 }
4224 decimal = Py_UNICODE_TODECIMAL(ch);
4225 if (decimal >= 0) {
4226 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004228 continue;
4229 }
Guido van Rossumba477042000-04-06 18:18:10 +00004230 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004231 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004233 continue;
4234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 /* All other characters are considered unencodable */
4236 collstart = p;
4237 collend = p+1;
4238 while (collend < end) {
4239 if ((0 < *collend && *collend < 256) ||
4240 !Py_UNICODE_ISSPACE(*collend) ||
4241 Py_UNICODE_TODECIMAL(*collend))
4242 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 /* cache callback name lookup
4245 * (if not done yet, i.e. it's the first error) */
4246 if (known_errorHandler==-1) {
4247 if ((errors==NULL) || (!strcmp(errors, "strict")))
4248 known_errorHandler = 1;
4249 else if (!strcmp(errors, "replace"))
4250 known_errorHandler = 2;
4251 else if (!strcmp(errors, "ignore"))
4252 known_errorHandler = 3;
4253 else if (!strcmp(errors, "xmlcharrefreplace"))
4254 known_errorHandler = 4;
4255 else
4256 known_errorHandler = 0;
4257 }
4258 switch (known_errorHandler) {
4259 case 1: /* strict */
4260 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4261 goto onError;
4262 case 2: /* replace */
4263 for (p = collstart; p < collend; ++p)
4264 *output++ = '?';
4265 /* fall through */
4266 case 3: /* ignore */
4267 p = collend;
4268 break;
4269 case 4: /* xmlcharrefreplace */
4270 /* generate replacement (temporarily (mis)uses p) */
4271 for (p = collstart; p < collend; ++p)
4272 output += sprintf(output, "&#%d;", (int)*p);
4273 p = collend;
4274 break;
4275 default:
4276 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4277 encoding, reason, s, length, &exc,
4278 collstart-s, collend-s, &newpos);
4279 if (repunicode == NULL)
4280 goto onError;
4281 /* generate replacement */
4282 repsize = PyUnicode_GET_SIZE(repunicode);
4283 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4284 Py_UNICODE ch = *uni2;
4285 if (Py_UNICODE_ISSPACE(ch))
4286 *output++ = ' ';
4287 else {
4288 decimal = Py_UNICODE_TODECIMAL(ch);
4289 if (decimal >= 0)
4290 *output++ = '0' + decimal;
4291 else if (0 < ch && ch < 256)
4292 *output++ = (char)ch;
4293 else {
4294 Py_DECREF(repunicode);
4295 raise_encode_exception(&exc, encoding,
4296 s, length, collstart-s, collend-s, reason);
4297 goto onError;
4298 }
4299 }
4300 }
4301 p = s + newpos;
4302 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004303 }
4304 }
4305 /* 0-terminate the output string */
4306 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 Py_XDECREF(exc);
4308 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004309 return 0;
4310
4311 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 Py_XDECREF(exc);
4313 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004314 return -1;
4315}
4316
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317/* --- Helpers ------------------------------------------------------------ */
4318
Thomas Wouters477c8d52006-05-27 19:21:47 +00004319#define STRINGLIB_CHAR Py_UNICODE
4320
4321#define STRINGLIB_LEN PyUnicode_GET_SIZE
4322#define STRINGLIB_NEW PyUnicode_FromUnicode
4323#define STRINGLIB_STR PyUnicode_AS_UNICODE
4324
4325Py_LOCAL_INLINE(int)
4326STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004328 if (str[0] != other[0])
4329 return 1;
4330 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331}
4332
Thomas Wouters477c8d52006-05-27 19:21:47 +00004333#define STRINGLIB_EMPTY unicode_empty
4334
4335#include "stringlib/fastsearch.h"
4336
4337#include "stringlib/count.h"
4338#include "stringlib/find.h"
4339#include "stringlib/partition.h"
4340
/* helper macro to fixup start/end slice values.
   Operates on the variables `start` and `end` of the calling scope:
   a negative start is taken relative to (obj)->length and then clamped
   to 0; an end beyond (obj)->length is clamped to the length, a negative
   end is taken relative to the length and then clamped to 0.
   Wrapped in do { } while (0) so that the expansion is one statement and
   is safe in an unbraced if/else; invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4353
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004355 PyObject *substr,
4356 Py_ssize_t start,
4357 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004359 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004360 PyUnicodeObject* str_obj;
4361 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004362
Thomas Wouters477c8d52006-05-27 19:21:47 +00004363 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4364 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004366 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4367 if (!sub_obj) {
4368 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 return -1;
4370 }
Tim Petersced69f82003-09-16 20:30:58 +00004371
Thomas Wouters477c8d52006-05-27 19:21:47 +00004372 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004373
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 result = stringlib_count(
4375 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4376 );
4377
4378 Py_DECREF(sub_obj);
4379 Py_DECREF(str_obj);
4380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 return result;
4382}
4383
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004385 PyObject *sub,
4386 Py_ssize_t start,
4387 Py_ssize_t end,
4388 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004390 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004393 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004394 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004395 sub = PyUnicode_FromObject(sub);
4396 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004397 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004398 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 }
Tim Petersced69f82003-09-16 20:30:58 +00004400
Thomas Wouters477c8d52006-05-27 19:21:47 +00004401 if (direction > 0)
4402 result = stringlib_find_slice(
4403 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4404 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4405 start, end
4406 );
4407 else
4408 result = stringlib_rfind_slice(
4409 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4410 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4411 start, end
4412 );
4413
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004415 Py_DECREF(sub);
4416
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 return result;
4418}
4419
Tim Petersced69f82003-09-16 20:30:58 +00004420static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421int tailmatch(PyUnicodeObject *self,
4422 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004423 Py_ssize_t start,
4424 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 int direction)
4426{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 if (substring->length == 0)
4428 return 1;
4429
Thomas Wouters477c8d52006-05-27 19:21:47 +00004430 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431
4432 end -= substring->length;
4433 if (end < start)
4434 return 0;
4435
4436 if (direction > 0) {
4437 if (Py_UNICODE_MATCH(self, end, substring))
4438 return 1;
4439 } else {
4440 if (Py_UNICODE_MATCH(self, start, substring))
4441 return 1;
4442 }
4443
4444 return 0;
4445}
4446
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t start,
4450 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 int direction)
4452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004453 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 str = PyUnicode_FromObject(str);
4456 if (str == NULL)
4457 return -1;
4458 substr = PyUnicode_FromObject(substr);
4459 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004460 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 return -1;
4462 }
Tim Petersced69f82003-09-16 20:30:58 +00004463
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 result = tailmatch((PyUnicodeObject *)str,
4465 (PyUnicodeObject *)substr,
4466 start, end, direction);
4467 Py_DECREF(str);
4468 Py_DECREF(substr);
4469 return result;
4470}
4471
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472/* Apply fixfct filter to the Unicode object self and return a
4473 reference to the modified object */
4474
Tim Petersced69f82003-09-16 20:30:58 +00004475static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476PyObject *fixup(PyUnicodeObject *self,
4477 int (*fixfct)(PyUnicodeObject *s))
4478{
4479
4480 PyUnicodeObject *u;
4481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004482 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 if (u == NULL)
4484 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004485
4486 Py_UNICODE_COPY(u->str, self->str, self->length);
4487
Tim Peters7a29bd52001-09-12 03:03:31 +00004488 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 /* fixfct should return TRUE if it modified the buffer. If
4490 FALSE, return a reference to the original buffer instead
4491 (to save space, not time) */
4492 Py_INCREF(self);
4493 Py_DECREF(u);
4494 return (PyObject*) self;
4495 }
4496 return (PyObject*) u;
4497}
4498
Tim Petersced69f82003-09-16 20:30:58 +00004499static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500int fixupper(PyUnicodeObject *self)
4501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 Py_UNICODE *s = self->str;
4504 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 while (len-- > 0) {
4507 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004508
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 ch = Py_UNICODE_TOUPPER(*s);
4510 if (ch != *s) {
4511 status = 1;
4512 *s = ch;
4513 }
4514 s++;
4515 }
4516
4517 return status;
4518}
4519
Tim Petersced69f82003-09-16 20:30:58 +00004520static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521int fixlower(PyUnicodeObject *self)
4522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004523 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 Py_UNICODE *s = self->str;
4525 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 while (len-- > 0) {
4528 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004529
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 ch = Py_UNICODE_TOLOWER(*s);
4531 if (ch != *s) {
4532 status = 1;
4533 *s = ch;
4534 }
4535 s++;
4536 }
4537
4538 return status;
4539}
4540
Tim Petersced69f82003-09-16 20:30:58 +00004541static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542int fixswapcase(PyUnicodeObject *self)
4543{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 Py_UNICODE *s = self->str;
4546 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 while (len-- > 0) {
4549 if (Py_UNICODE_ISUPPER(*s)) {
4550 *s = Py_UNICODE_TOLOWER(*s);
4551 status = 1;
4552 } else if (Py_UNICODE_ISLOWER(*s)) {
4553 *s = Py_UNICODE_TOUPPER(*s);
4554 status = 1;
4555 }
4556 s++;
4557 }
4558
4559 return status;
4560}
4561
Tim Petersced69f82003-09-16 20:30:58 +00004562static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563int fixcapitalize(PyUnicodeObject *self)
4564{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004566 Py_UNICODE *s = self->str;
4567 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004568
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004569 if (len == 0)
4570 return 0;
4571 if (Py_UNICODE_ISLOWER(*s)) {
4572 *s = Py_UNICODE_TOUPPER(*s);
4573 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004575 s++;
4576 while (--len > 0) {
4577 if (Py_UNICODE_ISUPPER(*s)) {
4578 *s = Py_UNICODE_TOLOWER(*s);
4579 status = 1;
4580 }
4581 s++;
4582 }
4583 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584}
4585
4586static
4587int fixtitle(PyUnicodeObject *self)
4588{
4589 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4590 register Py_UNICODE *e;
4591 int previous_is_cased;
4592
4593 /* Shortcut for single character strings */
4594 if (PyUnicode_GET_SIZE(self) == 1) {
4595 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4596 if (*p != ch) {
4597 *p = ch;
4598 return 1;
4599 }
4600 else
4601 return 0;
4602 }
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 e = p + PyUnicode_GET_SIZE(self);
4605 previous_is_cased = 0;
4606 for (; p < e; p++) {
4607 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 if (previous_is_cased)
4610 *p = Py_UNICODE_TOLOWER(ch);
4611 else
4612 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004613
4614 if (Py_UNICODE_ISLOWER(ch) ||
4615 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 Py_UNICODE_ISTITLE(ch))
4617 previous_is_cased = 1;
4618 else
4619 previous_is_cased = 0;
4620 }
4621 return 1;
4622}
4623
Tim Peters8ce9f162004-08-27 01:49:32 +00004624PyObject *
4625PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626{
Tim Peters8ce9f162004-08-27 01:49:32 +00004627 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004628 const Py_UNICODE blank = ' ';
4629 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004630 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004632 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4633 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4635 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004637 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004638 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639
Tim Peters05eba1f2004-08-27 21:32:02 +00004640 fseq = PySequence_Fast(seq, "");
4641 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004642 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004643 }
4644
Tim Peters91879ab2004-08-27 22:35:44 +00004645 /* Grrrr. A codec may be invoked to convert str objects to
4646 * Unicode, and so it's possible to call back into Python code
4647 * during PyUnicode_FromObject(), and so it's possible for a sick
4648 * codec to change the size of fseq (if seq is a list). Therefore
4649 * we have to keep refetching the size -- can't assume seqlen
4650 * is invariant.
4651 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004652 seqlen = PySequence_Fast_GET_SIZE(fseq);
4653 /* If empty sequence, return u"". */
4654 if (seqlen == 0) {
4655 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4656 goto Done;
4657 }
4658 /* If singleton sequence with an exact Unicode, return that. */
4659 if (seqlen == 1) {
4660 item = PySequence_Fast_GET_ITEM(fseq, 0);
4661 if (PyUnicode_CheckExact(item)) {
4662 Py_INCREF(item);
4663 res = (PyUnicodeObject *)item;
4664 goto Done;
4665 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004666 }
4667
Tim Peters05eba1f2004-08-27 21:32:02 +00004668 /* At least two items to join, or one that isn't exact Unicode. */
4669 if (seqlen > 1) {
4670 /* Set up sep and seplen -- they're needed. */
4671 if (separator == NULL) {
4672 sep = &blank;
4673 seplen = 1;
4674 }
4675 else {
4676 internal_separator = PyUnicode_FromObject(separator);
4677 if (internal_separator == NULL)
4678 goto onError;
4679 sep = PyUnicode_AS_UNICODE(internal_separator);
4680 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004681 /* In case PyUnicode_FromObject() mutated seq. */
4682 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004683 }
4684 }
4685
4686 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004687 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004688 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004689 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 res_p = PyUnicode_AS_UNICODE(res);
4691 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004692
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004694 Py_ssize_t itemlen;
4695 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004696
4697 item = PySequence_Fast_GET_ITEM(fseq, i);
4698 /* Convert item to Unicode. */
4699 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4700 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004701 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004702 " %.80s found",
4703 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004704 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004705 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004706 item = PyUnicode_FromObject(item);
4707 if (item == NULL)
4708 goto onError;
4709 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004710
Tim Peters91879ab2004-08-27 22:35:44 +00004711 /* In case PyUnicode_FromObject() mutated seq. */
4712 seqlen = PySequence_Fast_GET_SIZE(fseq);
4713
Tim Peters8ce9f162004-08-27 01:49:32 +00004714 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004716 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004717 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004718 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004719 if (i < seqlen - 1) {
4720 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004721 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004722 goto Overflow;
4723 }
4724 if (new_res_used > res_alloc) {
4725 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004726 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004727 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004728 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004729 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004730 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004731 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004732 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004734 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004735 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004737
4738 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004739 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004740 res_p += itemlen;
4741 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004742 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004743 res_p += seplen;
4744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004746 res_used = new_res_used;
4747 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004748
Tim Peters05eba1f2004-08-27 21:32:02 +00004749 /* Shrink res to match the used area; this probably can't fail,
4750 * but it's cheap to check.
4751 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004752 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004753 goto onError;
4754
4755 Done:
4756 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004757 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return (PyObject *)res;
4759
Tim Peters8ce9f162004-08-27 01:49:32 +00004760 Overflow:
4761 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004762 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004763 Py_DECREF(item);
4764 /* fall through */
4765
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004767 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004768 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004769 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 return NULL;
4771}
4772
Tim Petersced69f82003-09-16 20:30:58 +00004773static
4774PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775 Py_ssize_t left,
4776 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 Py_UNICODE fill)
4778{
4779 PyUnicodeObject *u;
4780
4781 if (left < 0)
4782 left = 0;
4783 if (right < 0)
4784 right = 0;
4785
Tim Peters7a29bd52001-09-12 03:03:31 +00004786 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 Py_INCREF(self);
4788 return self;
4789 }
4790
4791 u = _PyUnicode_New(left + self->length + right);
4792 if (u) {
4793 if (left)
4794 Py_UNICODE_FILL(u->str, fill, left);
4795 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4796 if (right)
4797 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4798 }
4799
4800 return u;
4801}
4802
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` variable and an `onError` label; the macro jumps to onError
   when the slice cannot be created or appended.  Wrapped in
   do { } while (0) so it behaves as a single statement and must be
   followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4813
4814static
4815PyObject *split_whitespace(PyUnicodeObject *self,
4816 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004817 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004819 register Py_ssize_t i;
4820 register Py_ssize_t j;
4821 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 PyObject *str;
4823
4824 for (i = j = 0; i < len; ) {
4825 /* find a token */
4826 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4827 i++;
4828 j = i;
4829 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4830 i++;
4831 if (j < i) {
4832 if (maxcount-- <= 0)
4833 break;
4834 SPLIT_APPEND(self->str, j, i);
4835 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4836 i++;
4837 j = i;
4838 }
4839 }
4840 if (j < len) {
4841 SPLIT_APPEND(self->str, j, len);
4842 }
4843 return list;
4844
4845 onError:
4846 Py_DECREF(list);
4847 return NULL;
4848}
4849
4850PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004851 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 register Py_ssize_t i;
4854 register Py_ssize_t j;
4855 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 PyObject *list;
4857 PyObject *str;
4858 Py_UNICODE *data;
4859
4860 string = PyUnicode_FromObject(string);
4861 if (string == NULL)
4862 return NULL;
4863 data = PyUnicode_AS_UNICODE(string);
4864 len = PyUnicode_GET_SIZE(string);
4865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 list = PyList_New(0);
4867 if (!list)
4868 goto onError;
4869
4870 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004872
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004874 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
4877 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004878 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 if (i < len) {
4880 if (data[i] == '\r' && i + 1 < len &&
4881 data[i+1] == '\n')
4882 i += 2;
4883 else
4884 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004885 if (keepends)
4886 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 }
Guido van Rossum86662912000-04-11 15:38:46 +00004888 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 j = i;
4890 }
4891 if (j < len) {
4892 SPLIT_APPEND(data, j, len);
4893 }
4894
4895 Py_DECREF(string);
4896 return list;
4897
4898 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004899 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 Py_DECREF(string);
4901 return NULL;
4902}
4903
Tim Petersced69f82003-09-16 20:30:58 +00004904static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905PyObject *split_char(PyUnicodeObject *self,
4906 PyObject *list,
4907 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004908 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004910 register Py_ssize_t i;
4911 register Py_ssize_t j;
4912 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 PyObject *str;
4914
4915 for (i = j = 0; i < len; ) {
4916 if (self->str[i] == ch) {
4917 if (maxcount-- <= 0)
4918 break;
4919 SPLIT_APPEND(self->str, j, i);
4920 i = j = i + 1;
4921 } else
4922 i++;
4923 }
4924 if (j <= len) {
4925 SPLIT_APPEND(self->str, j, len);
4926 }
4927 return list;
4928
4929 onError:
4930 Py_DECREF(list);
4931 return NULL;
4932}
4933
Tim Petersced69f82003-09-16 20:30:58 +00004934static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935PyObject *split_substring(PyUnicodeObject *self,
4936 PyObject *list,
4937 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004938 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004940 register Py_ssize_t i;
4941 register Py_ssize_t j;
4942 Py_ssize_t len = self->length;
4943 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 PyObject *str;
4945
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004946 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 if (Py_UNICODE_MATCH(self, i, substring)) {
4948 if (maxcount-- <= 0)
4949 break;
4950 SPLIT_APPEND(self->str, j, i);
4951 i = j = i + sublen;
4952 } else
4953 i++;
4954 }
4955 if (j <= len) {
4956 SPLIT_APPEND(self->str, j, len);
4957 }
4958 return list;
4959
4960 onError:
4961 Py_DECREF(list);
4962 return NULL;
4963}
4964
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004965static
4966PyObject *rsplit_whitespace(PyUnicodeObject *self,
4967 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004970 register Py_ssize_t i;
4971 register Py_ssize_t j;
4972 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004973 PyObject *str;
4974
4975 for (i = j = len - 1; i >= 0; ) {
4976 /* find a token */
4977 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4978 i--;
4979 j = i;
4980 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4981 i--;
4982 if (j > i) {
4983 if (maxcount-- <= 0)
4984 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004985 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004986 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4987 i--;
4988 j = i;
4989 }
4990 }
4991 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004992 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004993 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004994 if (PyList_Reverse(list) < 0)
4995 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996 return list;
4997
4998 onError:
4999 Py_DECREF(list);
5000 return NULL;
5001}
5002
5003static
5004PyObject *rsplit_char(PyUnicodeObject *self,
5005 PyObject *list,
5006 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005007 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005008{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005009 register Py_ssize_t i;
5010 register Py_ssize_t j;
5011 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005012 PyObject *str;
5013
5014 for (i = j = len - 1; i >= 0; ) {
5015 if (self->str[i] == ch) {
5016 if (maxcount-- <= 0)
5017 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005018 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005019 j = i = i - 1;
5020 } else
5021 i--;
5022 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005023 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005024 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005025 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005026 if (PyList_Reverse(list) < 0)
5027 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005028 return list;
5029
5030 onError:
5031 Py_DECREF(list);
5032 return NULL;
5033}
5034
5035static
5036PyObject *rsplit_substring(PyUnicodeObject *self,
5037 PyObject *list,
5038 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005039 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005040{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005041 register Py_ssize_t i;
5042 register Py_ssize_t j;
5043 Py_ssize_t len = self->length;
5044 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005045 PyObject *str;
5046
5047 for (i = len - sublen, j = len; i >= 0; ) {
5048 if (Py_UNICODE_MATCH(self, i, substring)) {
5049 if (maxcount-- <= 0)
5050 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005051 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005052 j = i;
5053 i -= sublen;
5054 } else
5055 i--;
5056 }
5057 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005058 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005059 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005060 if (PyList_Reverse(list) < 0)
5061 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005062 return list;
5063
5064 onError:
5065 Py_DECREF(list);
5066 return NULL;
5067}
5068
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069#undef SPLIT_APPEND
5070
5071static
5072PyObject *split(PyUnicodeObject *self,
5073 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075{
5076 PyObject *list;
5077
5078 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005079 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080
5081 list = PyList_New(0);
5082 if (!list)
5083 return NULL;
5084
5085 if (substring == NULL)
5086 return split_whitespace(self,list,maxcount);
5087
5088 else if (substring->length == 1)
5089 return split_char(self,list,substring->str[0],maxcount);
5090
5091 else if (substring->length == 0) {
5092 Py_DECREF(list);
5093 PyErr_SetString(PyExc_ValueError, "empty separator");
5094 return NULL;
5095 }
5096 else
5097 return split_substring(self,list,substring,maxcount);
5098}
5099
Tim Petersced69f82003-09-16 20:30:58 +00005100static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005101PyObject *rsplit(PyUnicodeObject *self,
5102 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005103 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005104{
5105 PyObject *list;
5106
5107 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005108 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005109
5110 list = PyList_New(0);
5111 if (!list)
5112 return NULL;
5113
5114 if (substring == NULL)
5115 return rsplit_whitespace(self,list,maxcount);
5116
5117 else if (substring->length == 1)
5118 return rsplit_char(self,list,substring->str[0],maxcount);
5119
5120 else if (substring->length == 0) {
5121 Py_DECREF(list);
5122 PyErr_SetString(PyExc_ValueError, "empty separator");
5123 return NULL;
5124 }
5125 else
5126 return rsplit_substring(self,list,substring,maxcount);
5127}
5128
5129static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130PyObject *replace(PyUnicodeObject *self,
5131 PyUnicodeObject *str1,
5132 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005133 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134{
5135 PyUnicodeObject *u;
5136
5137 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005138 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139
Thomas Wouters477c8d52006-05-27 19:21:47 +00005140 if (str1->length == str2->length) {
5141 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005142 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005143 if (str1->length == 1) {
5144 /* replace characters */
5145 Py_UNICODE u1, u2;
5146 if (!findchar(self->str, self->length, str1->str[0]))
5147 goto nothing;
5148 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5149 if (!u)
5150 return NULL;
5151 Py_UNICODE_COPY(u->str, self->str, self->length);
5152 u1 = str1->str[0];
5153 u2 = str2->str[0];
5154 for (i = 0; i < u->length; i++)
5155 if (u->str[i] == u1) {
5156 if (--maxcount < 0)
5157 break;
5158 u->str[i] = u2;
5159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005161 i = fastsearch(
5162 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005164 if (i < 0)
5165 goto nothing;
5166 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5167 if (!u)
5168 return NULL;
5169 Py_UNICODE_COPY(u->str, self->str, self->length);
5170 while (i <= self->length - str1->length)
5171 if (Py_UNICODE_MATCH(self, i, str1)) {
5172 if (--maxcount < 0)
5173 break;
5174 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5175 i += str1->length;
5176 } else
5177 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180
5181 Py_ssize_t n, i, j, e;
5182 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 Py_UNICODE *p;
5184
5185 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005186 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 if (n > maxcount)
5188 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005189 if (n == 0)
5190 goto nothing;
5191 /* new_size = self->length + n * (str2->length - str1->length)); */
5192 delta = (str2->length - str1->length);
5193 if (delta == 0) {
5194 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005196 product = n * (str2->length - str1->length);
5197 if ((product / (str2->length - str1->length)) != n) {
5198 PyErr_SetString(PyExc_OverflowError,
5199 "replace string is too long");
5200 return NULL;
5201 }
5202 new_size = self->length + product;
5203 if (new_size < 0) {
5204 PyErr_SetString(PyExc_OverflowError,
5205 "replace string is too long");
5206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 }
5208 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005209 u = _PyUnicode_New(new_size);
5210 if (!u)
5211 return NULL;
5212 i = 0;
5213 p = u->str;
5214 e = self->length - str1->length;
5215 if (str1->length > 0) {
5216 while (n-- > 0) {
5217 /* look for next match */
5218 j = i;
5219 while (j <= e) {
5220 if (Py_UNICODE_MATCH(self, j, str1))
5221 break;
5222 j++;
5223 }
5224 if (j > i) {
5225 if (j > e)
5226 break;
5227 /* copy unchanged part [i:j] */
5228 Py_UNICODE_COPY(p, self->str+i, j-i);
5229 p += j - i;
5230 }
5231 /* copy substitution string */
5232 if (str2->length > 0) {
5233 Py_UNICODE_COPY(p, str2->str, str2->length);
5234 p += str2->length;
5235 }
5236 i = j + str1->length;
5237 }
5238 if (i < self->length)
5239 /* copy tail [i:] */
5240 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5241 } else {
5242 /* interleave */
5243 while (n > 0) {
5244 Py_UNICODE_COPY(p, str2->str, str2->length);
5245 p += str2->length;
5246 if (--n <= 0)
5247 break;
5248 *p++ = self->str[i++];
5249 }
5250 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005254
5255nothing:
5256 /* nothing to replace; return original string (when possible) */
5257 if (PyUnicode_CheckExact(self)) {
5258 Py_INCREF(self);
5259 return (PyObject *) self;
5260 }
5261 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262}
5263
5264/* --- Unicode Object Methods --------------------------------------------- */
5265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005266PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267"S.title() -> unicode\n\
5268\n\
5269Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005270characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
5272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005273unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 return fixup(self, fixtitle);
5276}
5277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005278PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279"S.capitalize() -> unicode\n\
5280\n\
5281Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005282have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283
5284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005285unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 return fixup(self, fixcapitalize);
5288}
5289
#if 0
/* Disabled: S.capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in place in the list,
   then join the words with single blanks.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words = split(self, NULL, -1);    /* whitespace split */
    PyObject *result;
    Py_ssize_t idx;

    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;       /* result is NULL and is returned as such */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Glue the words back together; NULL separator means one blank */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5327
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005328/* Argument converter. Coerces to a single unicode character */
5329
5330static int
5331convert_uc(PyObject *obj, void *addr)
5332{
5333 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5334 PyObject *uniobj;
5335 Py_UNICODE *unistr;
5336
5337 uniobj = PyUnicode_FromObject(obj);
5338 if (uniobj == NULL) {
5339 PyErr_SetString(PyExc_TypeError,
5340 "The fill character cannot be converted to Unicode");
5341 return 0;
5342 }
5343 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5344 PyErr_SetString(PyExc_TypeError,
5345 "The fill character must be exactly one character long");
5346 Py_DECREF(uniobj);
5347 return 0;
5348 }
5349 unistr = PyUnicode_AS_UNICODE(uniobj);
5350 *fillcharloc = unistr[0];
5351 Py_DECREF(uniobj);
5352 return 1;
5353}
5354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005355PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005356"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005358Return S centered in a Unicode string of length width. Padding is\n\
5359done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360
5361static PyObject *
5362unicode_center(PyUnicodeObject *self, PyObject *args)
5363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005364 Py_ssize_t marg, left;
5365 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005366 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367
Thomas Woutersde017742006-02-16 19:34:37 +00005368 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 return NULL;
5370
Tim Peters7a29bd52001-09-12 03:03:31 +00005371 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 Py_INCREF(self);
5373 return (PyObject*) self;
5374 }
5375
5376 marg = width - self->length;
5377 left = marg / 2 + (marg & width & 1);
5378
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005379 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380}
5381
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382#if 0
5383
5384/* This code should go into some future Unicode collation support
5385 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005386 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005387
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005388/* speedy UTF-16 code point order comparison */
5389/* gleaned from: */
5390/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5391
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005392static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005393{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005394 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005395 0, 0, 0, 0, 0, 0, 0, 0,
5396 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005397 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005398};
5399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400static int
5401unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5402{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005403 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 Py_UNICODE *s1 = str1->str;
5406 Py_UNICODE *s2 = str2->str;
5407
5408 len1 = str1->length;
5409 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005410
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005412 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005413
5414 c1 = *s1++;
5415 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005416
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005417 if (c1 > (1<<11) * 26)
5418 c1 += utf16Fixup[c1>>11];
5419 if (c2 > (1<<11) * 26)
5420 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005421 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005422
5423 if (c1 != c2)
5424 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005425
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005426 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 }
5428
5429 return (len1 < len2) ? -1 : (len1 != len2);
5430}
5431
Marc-André Lemburge5034372000-08-08 08:04:29 +00005432#else
5433
5434static int
5435unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005438
5439 Py_UNICODE *s1 = str1->str;
5440 Py_UNICODE *s2 = str2->str;
5441
5442 len1 = str1->length;
5443 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Marc-André Lemburge5034372000-08-08 08:04:29 +00005445 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005446 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005447
Fredrik Lundh45714e92001-06-26 16:39:36 +00005448 c1 = *s1++;
5449 c2 = *s2++;
5450
5451 if (c1 != c2)
5452 return (c1 < c2) ? -1 : 1;
5453
Marc-André Lemburge5034372000-08-08 08:04:29 +00005454 len1--; len2--;
5455 }
5456
5457 return (len1 < len2) ? -1 : (len1 != len2);
5458}
5459
5460#endif
5461
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462int PyUnicode_Compare(PyObject *left,
5463 PyObject *right)
5464{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005465 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5466 return unicode_compare((PyUnicodeObject *)left,
5467 (PyUnicodeObject *)right);
5468 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5469 (PyUnicode_Check(left) && PyString_Check(right))) {
5470 if (PyUnicode_Check(left))
5471 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5472 if (PyUnicode_Check(right))
5473 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5474 assert(PyString_Check(left));
5475 assert(PyString_Check(right));
5476 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005478 PyErr_Format(PyExc_TypeError,
5479 "Can't compare %.100s and %.100s",
5480 left->ob_type->tp_name,
5481 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 return -1;
5483}
5484
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005485PyObject *PyUnicode_RichCompare(PyObject *left,
5486 PyObject *right,
5487 int op)
5488{
5489 int result;
5490
5491 result = PyUnicode_Compare(left, right);
5492 if (result == -1 && PyErr_Occurred())
5493 goto onError;
5494
5495 /* Convert the return value to a Boolean */
5496 switch (op) {
5497 case Py_EQ:
5498 result = (result == 0);
5499 break;
5500 case Py_NE:
5501 result = (result != 0);
5502 break;
5503 case Py_LE:
5504 result = (result <= 0);
5505 break;
5506 case Py_GE:
5507 result = (result >= 0);
5508 break;
5509 case Py_LT:
5510 result = (result == -1);
5511 break;
5512 case Py_GT:
5513 result = (result == 1);
5514 break;
5515 }
5516 return PyBool_FromLong(result);
5517
5518 onError:
5519
5520 /* Standard case
5521
5522 Type errors mean that PyUnicode_FromObject() could not convert
5523 one of the arguments (usually the right hand side) to Unicode,
5524 ie. we can't handle the comparison request. However, it is
5525 possible that the other object knows a comparison method, which
5526 is why we return Py_NotImplemented to give the other object a
5527 chance.
5528
5529 */
5530 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5531 PyErr_Clear();
5532 Py_INCREF(Py_NotImplemented);
5533 return Py_NotImplemented;
5534 }
5535 if (op != Py_EQ && op != Py_NE)
5536 return NULL;
5537
5538 /* Equality comparison.
5539
5540 This is a special case: we silence any PyExc_UnicodeDecodeError
5541 and instead turn it into a PyErr_UnicodeWarning.
5542
5543 */
5544 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5545 return NULL;
5546 PyErr_Clear();
5547 if (PyErr_Warn(PyExc_UnicodeWarning,
5548 (op == Py_EQ) ?
5549 "Unicode equal comparison "
5550 "failed to convert both arguments to Unicode - "
5551 "interpreting them as being unequal" :
5552 "Unicode unequal comparison "
5553 "failed to convert both arguments to Unicode - "
5554 "interpreting them as being unequal"
5555 ) < 0)
5556 return NULL;
5557 result = (op == Py_NE);
5558 return PyBool_FromLong(result);
5559}
5560
Guido van Rossum403d68b2000-03-13 15:55:09 +00005561int PyUnicode_Contains(PyObject *container,
5562 PyObject *element)
5563{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005565 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005566
5567 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 sub = PyUnicode_FromObject(element);
5569 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005570 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005571 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005572 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005573 }
5574
Thomas Wouters477c8d52006-05-27 19:21:47 +00005575 str = PyUnicode_FromObject(container);
5576 if (!str) {
5577 Py_DECREF(sub);
5578 return -1;
5579 }
5580
5581 result = stringlib_contains_obj(str, sub);
5582
5583 Py_DECREF(str);
5584 Py_DECREF(sub);
5585
Guido van Rossum403d68b2000-03-13 15:55:09 +00005586 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005587}
5588
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589/* Concat to string or Unicode object giving a new Unicode object. */
5590
5591PyObject *PyUnicode_Concat(PyObject *left,
5592 PyObject *right)
5593{
5594 PyUnicodeObject *u = NULL, *v = NULL, *w;
5595
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005596 if (PyBytes_Check(left) || PyBytes_Check(right))
5597 return PyBytes_Concat(left, right);
5598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* Coerce the two arguments */
5600 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5601 if (u == NULL)
5602 goto onError;
5603 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5604 if (v == NULL)
5605 goto onError;
5606
5607 /* Shortcuts */
5608 if (v == unicode_empty) {
5609 Py_DECREF(v);
5610 return (PyObject *)u;
5611 }
5612 if (u == unicode_empty) {
5613 Py_DECREF(u);
5614 return (PyObject *)v;
5615 }
5616
5617 /* Concat the two Unicode strings */
5618 w = _PyUnicode_New(u->length + v->length);
5619 if (w == NULL)
5620 goto onError;
5621 Py_UNICODE_COPY(w->str, u->str, u->length);
5622 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5623
5624 Py_DECREF(u);
5625 Py_DECREF(v);
5626 return (PyObject *)w;
5627
5628onError:
5629 Py_XDECREF(u);
5630 Py_XDECREF(v);
5631 return NULL;
5632}
5633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005634PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635"S.count(sub[, start[, end]]) -> int\n\
5636\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005637Return the number of non-overlapping occurrences of substring sub in\n\
5638Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
5641static PyObject *
5642unicode_count(PyUnicodeObject *self, PyObject *args)
5643{
5644 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005645 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005646 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 PyObject *result;
5648
Guido van Rossumb8872e62000-05-09 14:14:27 +00005649 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5650 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 return NULL;
5652
5653 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005654 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 if (substring == NULL)
5656 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005657
Thomas Wouters477c8d52006-05-27 19:21:47 +00005658 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Thomas Wouters477c8d52006-05-27 19:21:47 +00005660 result = PyInt_FromSsize_t(
5661 stringlib_count(self->str + start, end - start,
5662 substring->str, substring->length)
5663 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
5665 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 return result;
5668}
5669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005670PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005671"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005673Encodes S using the codec registered for encoding. encoding defaults\n\
5674to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005675handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5677'xmlcharrefreplace' as well as any other name registered with\n\
5678codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679
5680static PyObject *
5681unicode_encode(PyUnicodeObject *self, PyObject *args)
5682{
5683 char *encoding = NULL;
5684 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005685 PyObject *v;
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5688 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005689 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005690 if (v == NULL)
5691 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005692 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005693 if (PyString_Check(v)) {
5694 /* Old codec, turn it into bytes */
5695 PyObject *b = PyBytes_FromObject(v);
5696 Py_DECREF(v);
5697 return b;
5698 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005699 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005700 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005701 "(type=%.400s)",
5702 v->ob_type->tp_name);
5703 Py_DECREF(v);
5704 return NULL;
5705 }
5706 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005707
5708 onError:
5709 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005710}
5711
5712PyDoc_STRVAR(decode__doc__,
5713"S.decode([encoding[,errors]]) -> string or unicode\n\
5714\n\
5715Decodes S using the codec registered for encoding. encoding defaults\n\
5716to the default encoding. errors may be given to set a different error\n\
5717handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5718a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5719as well as any other name registerd with codecs.register_error that is\n\
5720able to handle UnicodeDecodeErrors.");
5721
5722static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005723unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005724{
5725 char *encoding = NULL;
5726 char *errors = NULL;
5727 PyObject *v;
5728
5729 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5730 return NULL;
5731 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005732 if (v == NULL)
5733 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005734 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5735 PyErr_Format(PyExc_TypeError,
5736 "decoder did not return a string/unicode object "
5737 "(type=%.400s)",
5738 v->ob_type->tp_name);
5739 Py_DECREF(v);
5740 return NULL;
5741 }
5742 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005743
5744 onError:
5745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746}
5747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005748PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749"S.expandtabs([tabsize]) -> unicode\n\
5750\n\
5751Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005752If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
5754static PyObject*
5755unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5756{
5757 Py_UNICODE *e;
5758 Py_UNICODE *p;
5759 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 PyUnicodeObject *u;
5762 int tabsize = 8;
5763
5764 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5765 return NULL;
5766
Thomas Wouters7e474022000-07-16 12:04:32 +00005767 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 i = j = 0;
5769 e = self->str + self->length;
5770 for (p = self->str; p < e; p++)
5771 if (*p == '\t') {
5772 if (tabsize > 0)
5773 j += tabsize - (j % tabsize);
5774 }
5775 else {
5776 j++;
5777 if (*p == '\n' || *p == '\r') {
5778 i += j;
5779 j = 0;
5780 }
5781 }
5782
5783 /* Second pass: create output string and fill it */
5784 u = _PyUnicode_New(i + j);
5785 if (!u)
5786 return NULL;
5787
5788 j = 0;
5789 q = u->str;
5790
5791 for (p = self->str; p < e; p++)
5792 if (*p == '\t') {
5793 if (tabsize > 0) {
5794 i = tabsize - (j % tabsize);
5795 j += i;
5796 while (i--)
5797 *q++ = ' ';
5798 }
5799 }
5800 else {
5801 j++;
5802 *q++ = *p;
5803 if (*p == '\n' || *p == '\r')
5804 j = 0;
5805 }
5806
5807 return (PyObject*) u;
5808}
5809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005810PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811"S.find(sub [,start [,end]]) -> int\n\
5812\n\
5813Return the lowest index in S where substring sub is found,\n\
5814such that sub is contained within s[start,end]. Optional\n\
5815arguments start and end are interpreted as in slice notation.\n\
5816\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005817Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
5819static PyObject *
5820unicode_find(PyUnicodeObject *self, PyObject *args)
5821{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005822 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005824 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826
Guido van Rossumb8872e62000-05-09 14:14:27 +00005827 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5828 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830 substring = PyUnicode_FromObject(substring);
5831 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 return NULL;
5833
Thomas Wouters477c8d52006-05-27 19:21:47 +00005834 result = stringlib_find_slice(
5835 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5836 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5837 start, end
5838 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
5840 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841
5842 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843}
5844
5845static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005846unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
5848 if (index < 0 || index >= self->length) {
5849 PyErr_SetString(PyExc_IndexError, "string index out of range");
5850 return NULL;
5851 }
5852
5853 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5854}
5855
5856static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005857unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005859 /* Since Unicode objects compare equal to their UTF-8 string
5860 counterparts, we hash the UTF-8 string. */
5861 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5862 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863}
5864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005865PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866"S.index(sub [,start [,end]]) -> int\n\
5867\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005868Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870static PyObject *
5871unicode_index(PyUnicodeObject *self, PyObject *args)
5872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005873 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005875 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005876 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Guido van Rossumb8872e62000-05-09 14:14:27 +00005878 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5879 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881 substring = PyUnicode_FromObject(substring);
5882 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 return NULL;
5884
Thomas Wouters477c8d52006-05-27 19:21:47 +00005885 result = stringlib_find_slice(
5886 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5887 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5888 start, end
5889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
5891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 if (result < 0) {
5894 PyErr_SetString(PyExc_ValueError, "substring not found");
5895 return NULL;
5896 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005897
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899}
5900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005901PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005902"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005904Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005905at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
5907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005908unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
5910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5911 register const Py_UNICODE *e;
5912 int cased;
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 /* Shortcut for single character strings */
5915 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005916 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005918 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005919 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005920 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 e = p + PyUnicode_GET_SIZE(self);
5923 cased = 0;
5924 for (; p < e; p++) {
5925 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005928 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 else if (!cased && Py_UNICODE_ISLOWER(ch))
5930 cased = 1;
5931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005932 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005936"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005938Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005942unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
5944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5945 register const Py_UNICODE *e;
5946 int cased;
5947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* Shortcut for single character strings */
5949 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005952 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005953 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005954 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005955
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 e = p + PyUnicode_GET_SIZE(self);
5957 cased = 0;
5958 for (; p < e; p++) {
5959 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005960
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 else if (!cased && Py_UNICODE_ISUPPER(ch))
5964 cased = 1;
5965 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005966 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967}
5968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005970"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005972Return True if S is a titlecased string and there is at least one\n\
5973character in S, i.e. upper- and titlecase characters may only\n\
5974follow uncased characters and lowercase characters only cased ones.\n\
5975Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005978unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
5980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5981 register const Py_UNICODE *e;
5982 int cased, previous_is_cased;
5983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 /* Shortcut for single character strings */
5985 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5987 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005989 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005990 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005991 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 e = p + PyUnicode_GET_SIZE(self);
5994 cased = 0;
5995 previous_is_cased = 0;
5996 for (; p < e; p++) {
5997 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6000 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006001 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 previous_is_cased = 1;
6003 cased = 1;
6004 }
6005 else if (Py_UNICODE_ISLOWER(ch)) {
6006 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 previous_is_cased = 1;
6009 cased = 1;
6010 }
6011 else
6012 previous_is_cased = 0;
6013 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015}
6016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006017PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006018"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006020Return True if all characters in S are whitespace\n\
6021and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
6023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006024unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
6026 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6027 register const Py_UNICODE *e;
6028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 /* Shortcut for single character strings */
6030 if (PyUnicode_GET_SIZE(self) == 1 &&
6031 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006034 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006035 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 e = p + PyUnicode_GET_SIZE(self);
6039 for (; p < e; p++) {
6040 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006041 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006043 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006046PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006047"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006048\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006049Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006050and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051
6052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006053unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006054{
6055 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6056 register const Py_UNICODE *e;
6057
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006058 /* Shortcut for single character strings */
6059 if (PyUnicode_GET_SIZE(self) == 1 &&
6060 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062
6063 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006064 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066
6067 e = p + PyUnicode_GET_SIZE(self);
6068 for (; p < e; p++) {
6069 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006070 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006071 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006072 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006073}
6074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006075PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006076"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006077\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006078Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006079and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006080
6081static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006082unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006083{
6084 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6085 register const Py_UNICODE *e;
6086
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006087 /* Shortcut for single character strings */
6088 if (PyUnicode_GET_SIZE(self) == 1 &&
6089 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091
6092 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006093 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006095
6096 e = p + PyUnicode_GET_SIZE(self);
6097 for (; p < e; p++) {
6098 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006099 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006100 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006101 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006102}
6103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006104PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006105"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006108False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
6110static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006111unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112{
6113 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6114 register const Py_UNICODE *e;
6115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 /* Shortcut for single character strings */
6117 if (PyUnicode_GET_SIZE(self) == 1 &&
6118 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006121 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006122 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 e = p + PyUnicode_GET_SIZE(self);
6126 for (; p < e; p++) {
6127 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006128 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006130 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131}
6132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006133PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006134"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006136Return True if all characters in S are digits\n\
6137and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
6139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006140unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
6142 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6143 register const Py_UNICODE *e;
6144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 /* Shortcut for single character strings */
6146 if (PyUnicode_GET_SIZE(self) == 1 &&
6147 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006150 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006151 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 e = p + PyUnicode_GET_SIZE(self);
6155 for (; p < e; p++) {
6156 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006157 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006159 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160}
6161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006162PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006163"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006166False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
6168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006169unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170{
6171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6172 register const Py_UNICODE *e;
6173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 /* Shortcut for single character strings */
6175 if (PyUnicode_GET_SIZE(self) == 1 &&
6176 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006179 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006180 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006181 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 e = p + PyUnicode_GET_SIZE(self);
6184 for (; p < e; p++) {
6185 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006186 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006188 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189}
6190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006191PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192"S.join(sequence) -> unicode\n\
6193\n\
6194Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006195sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
6197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006198unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006200 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201}
6202
Martin v. Löwis18e16552006-02-15 17:27:45 +00006203static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204unicode_length(PyUnicodeObject *self)
6205{
6206 return self->length;
6207}
6208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006210"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211\n\
6212Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006213done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215static PyObject *
6216unicode_ljust(PyUnicodeObject *self, PyObject *args)
6217{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006218 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006219 Py_UNICODE fillchar = ' ';
6220
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006221 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 return NULL;
6223
Tim Peters7a29bd52001-09-12 03:03:31 +00006224 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 Py_INCREF(self);
6226 return (PyObject*) self;
6227 }
6228
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006229 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230}
6231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006232PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233"S.lower() -> unicode\n\
6234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006235Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
6237static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006238unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 return fixup(self, fixlower);
6241}
6242
/* Selectors for the strip helpers.  They double as indexes into
   stripformat[] below, so their numeric values must stay as they are. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: the format string with its leading
   three characters ("|O:") skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6251
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006252/* externally visible for str.strip(unicode) */
6253PyObject *
6254_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6255{
6256 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006257 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006258 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006259 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6260 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006261
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6263
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006264 i = 0;
6265 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006266 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6267 i++;
6268 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006269 }
6270
6271 j = len;
6272 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 do {
6274 j--;
6275 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6276 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006277 }
6278
6279 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006280 Py_INCREF(self);
6281 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006282 }
6283 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006284 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006285}
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287
6288static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006289do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006291 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006292 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006293
6294 i = 0;
6295 if (striptype != RIGHTSTRIP) {
6296 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6297 i++;
6298 }
6299 }
6300
6301 j = len;
6302 if (striptype != LEFTSTRIP) {
6303 do {
6304 j--;
6305 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6306 j++;
6307 }
6308
6309 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6310 Py_INCREF(self);
6311 return (PyObject*)self;
6312 }
6313 else
6314 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315}
6316
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006317
6318static PyObject *
6319do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6320{
6321 PyObject *sep = NULL;
6322
6323 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6324 return NULL;
6325
6326 if (sep != NULL && sep != Py_None) {
6327 if (PyUnicode_Check(sep))
6328 return _PyUnicode_XStrip(self, striptype, sep);
6329 else if (PyString_Check(sep)) {
6330 PyObject *res;
6331 sep = PyUnicode_FromObject(sep);
6332 if (sep==NULL)
6333 return NULL;
6334 res = _PyUnicode_XStrip(self, striptype, sep);
6335 Py_DECREF(sep);
6336 return res;
6337 }
6338 else {
6339 PyErr_Format(PyExc_TypeError,
6340 "%s arg must be None, unicode or str",
6341 STRIPNAME(striptype));
6342 return NULL;
6343 }
6344 }
6345
6346 return do_strip(self, striptype);
6347}
6348
6349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006350PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006351"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006352\n\
6353Return a copy of the string S with leading and trailing\n\
6354whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006355If chars is given and not None, remove characters in chars instead.\n\
6356If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006357
6358static PyObject *
6359unicode_strip(PyUnicodeObject *self, PyObject *args)
6360{
6361 if (PyTuple_GET_SIZE(args) == 0)
6362 return do_strip(self, BOTHSTRIP); /* Common case */
6363 else
6364 return do_argstrip(self, BOTHSTRIP, args);
6365}
6366
6367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006368PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006369"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006370\n\
6371Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006372If chars is given and not None, remove characters in chars instead.\n\
6373If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006374
6375static PyObject *
6376unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6377{
6378 if (PyTuple_GET_SIZE(args) == 0)
6379 return do_strip(self, LEFTSTRIP); /* Common case */
6380 else
6381 return do_argstrip(self, LEFTSTRIP, args);
6382}
6383
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006386"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006387\n\
6388Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006389If chars is given and not None, remove characters in chars instead.\n\
6390If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006391
6392static PyObject *
6393unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6394{
6395 if (PyTuple_GET_SIZE(args) == 0)
6396 return do_strip(self, RIGHTSTRIP); /* Common case */
6397 else
6398 return do_argstrip(self, RIGHTSTRIP, args);
6399}
6400
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
6405 PyUnicodeObject *u;
6406 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006408 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410 if (len < 0)
6411 len = 0;
6412
Tim Peters7a29bd52001-09-12 03:03:31 +00006413 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 /* no repeat, return original string */
6415 Py_INCREF(str);
6416 return (PyObject*) str;
6417 }
Tim Peters8f422462000-09-09 06:13:41 +00006418
6419 /* ensure # of chars needed doesn't overflow int and # of bytes
6420 * needed doesn't overflow size_t
6421 */
6422 nchars = len * str->length;
6423 if (len && nchars / len != str->length) {
6424 PyErr_SetString(PyExc_OverflowError,
6425 "repeated string is too long");
6426 return NULL;
6427 }
6428 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6429 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6430 PyErr_SetString(PyExc_OverflowError,
6431 "repeated string is too long");
6432 return NULL;
6433 }
6434 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 if (!u)
6436 return NULL;
6437
6438 p = u->str;
6439
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440 if (str->length == 1 && len > 0) {
6441 Py_UNICODE_FILL(p, str->str[0], len);
6442 } else {
6443 Py_ssize_t done = 0; /* number of characters copied this far */
6444 if (done < nchars) {
6445 Py_UNICODE_COPY(p, str->str, str->length);
6446 done = str->length;
6447 }
6448 while (done < nchars) {
6449 int n = (done <= nchars-done) ? done : nchars-done;
6450 Py_UNICODE_COPY(p+done, p, n);
6451 done += n;
6452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 }
6454
6455 return (PyObject*) u;
6456}
6457
6458PyObject *PyUnicode_Replace(PyObject *obj,
6459 PyObject *subobj,
6460 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
6463 PyObject *self;
6464 PyObject *str1;
6465 PyObject *str2;
6466 PyObject *result;
6467
6468 self = PyUnicode_FromObject(obj);
6469 if (self == NULL)
6470 return NULL;
6471 str1 = PyUnicode_FromObject(subobj);
6472 if (str1 == NULL) {
6473 Py_DECREF(self);
6474 return NULL;
6475 }
6476 str2 = PyUnicode_FromObject(replobj);
6477 if (str2 == NULL) {
6478 Py_DECREF(self);
6479 Py_DECREF(str1);
6480 return NULL;
6481 }
Tim Petersced69f82003-09-16 20:30:58 +00006482 result = replace((PyUnicodeObject *)self,
6483 (PyUnicodeObject *)str1,
6484 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 maxcount);
6486 Py_DECREF(self);
6487 Py_DECREF(str1);
6488 Py_DECREF(str2);
6489 return result;
6490}
6491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006492PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493"S.replace (old, new[, maxsplit]) -> unicode\n\
6494\n\
6495Return a copy of S with all occurrences of substring\n\
6496old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006497given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499static PyObject*
6500unicode_replace(PyUnicodeObject *self, PyObject *args)
6501{
6502 PyUnicodeObject *str1;
6503 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006504 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 PyObject *result;
6506
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 return NULL;
6509 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6510 if (str1 == NULL)
6511 return NULL;
6512 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006513 if (str2 == NULL) {
6514 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
6518 result = replace(self, str1, str2, maxcount);
6519
6520 Py_DECREF(str1);
6521 Py_DECREF(str2);
6522 return result;
6523}
6524
6525static
6526PyObject *unicode_repr(PyObject *unicode)
6527{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006528 PyObject *repr;
6529 char *p;
6530 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6531 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6532
6533 /* XXX(nnorwitz): rather than over-allocating, it would be
6534 better to choose a different scheme. Perhaps scan the
6535 first N-chars of the string and allocate based on that size.
6536 */
6537 /* Initial allocation is based on the longest-possible unichr
6538 escape.
6539
6540 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6541 unichr, so in this case it's the longest unichr escape. In
6542 narrow (UTF-16) builds this is five chars per source unichr
6543 since there are two unichrs in the surrogate pair, so in narrow
6544 (UTF-16) builds it's not the longest unichr escape.
6545
6546 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6547 so in the narrow (UTF-16) build case it's the longest unichr
6548 escape.
6549 */
6550
6551 repr = PyString_FromStringAndSize(NULL,
6552 2 /* quotes */
6553#ifdef Py_UNICODE_WIDE
6554 + 10*size
6555#else
6556 + 6*size
6557#endif
6558 + 1);
6559 if (repr == NULL)
6560 return NULL;
6561
6562 p = PyString_AS_STRING(repr);
6563
6564 /* Add quote */
6565 *p++ = (findchar(s, size, '\'') &&
6566 !findchar(s, size, '"')) ? '"' : '\'';
6567 while (size-- > 0) {
6568 Py_UNICODE ch = *s++;
6569
6570 /* Escape quotes and backslashes */
6571 if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
6572 *p++ = '\\';
6573 *p++ = (char) ch;
6574 continue;
6575 }
6576
6577#ifdef Py_UNICODE_WIDE
6578 /* Map 21-bit characters to '\U00xxxxxx' */
6579 else if (ch >= 0x10000) {
6580 *p++ = '\\';
6581 *p++ = 'U';
6582 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6583 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6584 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6585 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6586 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6587 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6588 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6589 *p++ = hexdigits[ch & 0x0000000F];
6590 continue;
6591 }
6592#else
6593 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6594 else if (ch >= 0xD800 && ch < 0xDC00) {
6595 Py_UNICODE ch2;
6596 Py_UCS4 ucs;
6597
6598 ch2 = *s++;
6599 size--;
6600 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6601 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6602 *p++ = '\\';
6603 *p++ = 'U';
6604 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6605 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6606 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6607 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6608 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6609 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6610 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6611 *p++ = hexdigits[ucs & 0x0000000F];
6612 continue;
6613 }
6614 /* Fall through: isolated surrogates are copied as-is */
6615 s--;
6616 size++;
6617 }
6618#endif
6619
6620 /* Map 16-bit characters to '\uxxxx' */
6621 if (ch >= 256) {
6622 *p++ = '\\';
6623 *p++ = 'u';
6624 *p++ = hexdigits[(ch >> 12) & 0x000F];
6625 *p++ = hexdigits[(ch >> 8) & 0x000F];
6626 *p++ = hexdigits[(ch >> 4) & 0x000F];
6627 *p++ = hexdigits[ch & 0x000F];
6628 }
6629
6630 /* Map special whitespace to '\t', \n', '\r' */
6631 else if (ch == '\t') {
6632 *p++ = '\\';
6633 *p++ = 't';
6634 }
6635 else if (ch == '\n') {
6636 *p++ = '\\';
6637 *p++ = 'n';
6638 }
6639 else if (ch == '\r') {
6640 *p++ = '\\';
6641 *p++ = 'r';
6642 }
6643
6644 /* Map non-printable US ASCII to '\xhh' */
6645 else if (ch < ' ' || ch >= 0x7F) {
6646 *p++ = '\\';
6647 *p++ = 'x';
6648 *p++ = hexdigits[(ch >> 4) & 0x000F];
6649 *p++ = hexdigits[ch & 0x000F];
6650 }
6651
6652 /* Copy everything else as-is */
6653 else
6654 *p++ = (char) ch;
6655 }
6656 /* Add quote */
6657 *p++ = PyString_AS_STRING(repr)[0];
6658
6659 *p = '\0';
6660 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
6661 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665"S.rfind(sub [,start [,end]]) -> int\n\
6666\n\
6667Return the highest index in S where substring sub is found,\n\
6668such that sub is contained within s[start,end]. Optional\n\
6669arguments start and end are interpreted as in slice notation.\n\
6670\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006671Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_rfind(PyUnicodeObject *self, PyObject *args)
6675{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006678 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006679 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Guido van Rossumb8872e62000-05-09 14:14:27 +00006681 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6682 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006684 substring = PyUnicode_FromObject(substring);
6685 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 return NULL;
6687
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688 result = stringlib_rfind_slice(
6689 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6690 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6691 start, end
6692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006695
6696 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697}
6698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006699PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700"S.rindex(sub [,start [,end]]) -> int\n\
6701\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006702Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703
6704static PyObject *
6705unicode_rindex(PyUnicodeObject *self, PyObject *args)
6706{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006707 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006708 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006709 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006710 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
Guido van Rossumb8872e62000-05-09 14:14:27 +00006712 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6713 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006715 substring = PyUnicode_FromObject(substring);
6716 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return NULL;
6718
Thomas Wouters477c8d52006-05-27 19:21:47 +00006719 result = stringlib_rfind_slice(
6720 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6721 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6722 start, end
6723 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 if (result < 0) {
6728 PyErr_SetString(PyExc_ValueError, "substring not found");
6729 return NULL;
6730 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006731 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732}
6733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006735"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736\n\
6737Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006738done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739
6740static PyObject *
6741unicode_rjust(PyUnicodeObject *self, PyObject *args)
6742{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006743 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006744 Py_UNICODE fillchar = ' ';
6745
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006746 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 return NULL;
6748
Tim Peters7a29bd52001-09-12 03:03:31 +00006749 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 Py_INCREF(self);
6751 return (PyObject*) self;
6752 }
6753
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006754 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755}
6756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
6760 /* standard clamping */
6761 if (start < 0)
6762 start = 0;
6763 if (end < 0)
6764 end = 0;
6765 if (end > self->length)
6766 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006767 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 /* full slice, return original string */
6769 Py_INCREF(self);
6770 return (PyObject*) self;
6771 }
6772 if (start > end)
6773 start = end;
6774 /* copy slice */
6775 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6776 end - start);
6777}
6778
6779PyObject *PyUnicode_Split(PyObject *s,
6780 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
6783 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 s = PyUnicode_FromObject(s);
6786 if (s == NULL)
6787 return NULL;
6788 if (sep != NULL) {
6789 sep = PyUnicode_FromObject(sep);
6790 if (sep == NULL) {
6791 Py_DECREF(s);
6792 return NULL;
6793 }
6794 }
6795
6796 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6797
6798 Py_DECREF(s);
6799 Py_XDECREF(sep);
6800 return result;
6801}
6802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804"S.split([sep [,maxsplit]]) -> list of strings\n\
6805\n\
6806Return a list of the words in S, using sep as the\n\
6807delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006808splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006809any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811static PyObject*
6812unicode_split(PyUnicodeObject *self, PyObject *args)
6813{
6814 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
Martin v. Löwis18e16552006-02-15 17:27:45 +00006817 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 return NULL;
6819
6820 if (substring == Py_None)
6821 return split(self, NULL, maxcount);
6822 else if (PyUnicode_Check(substring))
6823 return split(self, (PyUnicodeObject *)substring, maxcount);
6824 else
6825 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6826}
6827
Thomas Wouters477c8d52006-05-27 19:21:47 +00006828PyObject *
6829PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6830{
6831 PyObject* str_obj;
6832 PyObject* sep_obj;
6833 PyObject* out;
6834
6835 str_obj = PyUnicode_FromObject(str_in);
6836 if (!str_obj)
6837 return NULL;
6838 sep_obj = PyUnicode_FromObject(sep_in);
6839 if (!sep_obj) {
6840 Py_DECREF(str_obj);
6841 return NULL;
6842 }
6843
6844 out = stringlib_partition(
6845 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6846 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6847 );
6848
6849 Py_DECREF(sep_obj);
6850 Py_DECREF(str_obj);
6851
6852 return out;
6853}
6854
6855
6856PyObject *
6857PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6858{
6859 PyObject* str_obj;
6860 PyObject* sep_obj;
6861 PyObject* out;
6862
6863 str_obj = PyUnicode_FromObject(str_in);
6864 if (!str_obj)
6865 return NULL;
6866 sep_obj = PyUnicode_FromObject(sep_in);
6867 if (!sep_obj) {
6868 Py_DECREF(str_obj);
6869 return NULL;
6870 }
6871
6872 out = stringlib_rpartition(
6873 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6874 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6875 );
6876
6877 Py_DECREF(sep_obj);
6878 Py_DECREF(str_obj);
6879
6880 return out;
6881}
6882
6883PyDoc_STRVAR(partition__doc__,
6884"S.partition(sep) -> (head, sep, tail)\n\
6885\n\
6886Searches for the separator sep in S, and returns the part before it,\n\
6887the separator itself, and the part after it. If the separator is not\n\
6888found, returns S and two empty strings.");
6889
6890static PyObject*
6891unicode_partition(PyUnicodeObject *self, PyObject *separator)
6892{
6893 return PyUnicode_Partition((PyObject *)self, separator);
6894}
6895
6896PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006897"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006898\n\
6899Searches for the separator sep in S, starting at the end of S, and returns\n\
6900the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006901separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006902
6903static PyObject*
6904unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6905{
6906 return PyUnicode_RPartition((PyObject *)self, separator);
6907}
6908
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006909PyObject *PyUnicode_RSplit(PyObject *s,
6910 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006911 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006912{
6913 PyObject *result;
6914
6915 s = PyUnicode_FromObject(s);
6916 if (s == NULL)
6917 return NULL;
6918 if (sep != NULL) {
6919 sep = PyUnicode_FromObject(sep);
6920 if (sep == NULL) {
6921 Py_DECREF(s);
6922 return NULL;
6923 }
6924 }
6925
6926 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6927
6928 Py_DECREF(s);
6929 Py_XDECREF(sep);
6930 return result;
6931}
6932
6933PyDoc_STRVAR(rsplit__doc__,
6934"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6935\n\
6936Return a list of the words in S, using sep as the\n\
6937delimiter string, starting at the end of the string and\n\
6938working to the front. If maxsplit is given, at most maxsplit\n\
6939splits are done. If sep is not specified, any whitespace string\n\
6940is a separator.");
6941
6942static PyObject*
6943unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6944{
6945 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006946 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006947
Martin v. Löwis18e16552006-02-15 17:27:45 +00006948 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006949 return NULL;
6950
6951 if (substring == Py_None)
6952 return rsplit(self, NULL, maxcount);
6953 else if (PyUnicode_Check(substring))
6954 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6955 else
6956 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6957}
6958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006960"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961\n\
6962Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006963Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965
6966static PyObject*
6967unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6968{
Guido van Rossum86662912000-04-11 15:38:46 +00006969 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Guido van Rossum86662912000-04-11 15:38:46 +00006971 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 return NULL;
6973
Guido van Rossum86662912000-04-11 15:38:46 +00006974 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975}
6976
6977static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006978PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006980 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6981 Py_XINCREF(res);
6982 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986"S.swapcase() -> unicode\n\
6987\n\
6988Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006992unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 return fixup(self, fixswapcase);
6995}
6996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998"S.translate(table) -> unicode\n\
6999\n\
7000Return a copy of the string S, where all characters have been mapped\n\
7001through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007002Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7003Unmapped characters are left untouched. Characters mapped to None\n\
7004are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
7006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007007unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008{
Tim Petersced69f82003-09-16 20:30:58 +00007009 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007011 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 "ignore");
7013}
7014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007015PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016"S.upper() -> unicode\n\
7017\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007018Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019
7020static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007021unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 return fixup(self, fixupper);
7024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027"S.zfill(width) -> unicode\n\
7028\n\
7029Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject *
7033unicode_zfill(PyUnicodeObject *self, PyObject *args)
7034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007035 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 PyUnicodeObject *u;
7037
Martin v. Löwis18e16552006-02-15 17:27:45 +00007038 Py_ssize_t width;
7039 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 return NULL;
7041
7042 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007043 if (PyUnicode_CheckExact(self)) {
7044 Py_INCREF(self);
7045 return (PyObject*) self;
7046 }
7047 else
7048 return PyUnicode_FromUnicode(
7049 PyUnicode_AS_UNICODE(self),
7050 PyUnicode_GET_SIZE(self)
7051 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 }
7053
7054 fill = width - self->length;
7055
7056 u = pad(self, fill, 0, '0');
7057
Walter Dörwald068325e2002-04-15 13:36:47 +00007058 if (u == NULL)
7059 return NULL;
7060
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 if (u->str[fill] == '+' || u->str[fill] == '-') {
7062 /* move sign to beginning of string */
7063 u->str[0] = u->str[fill];
7064 u->str[fill] = '0';
7065 }
7066
7067 return (PyObject*) u;
7068}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069
#if 0
/* Disabled debugging aid: reports unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007078PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007079"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007081Return True if S starts with the specified prefix, False otherwise.\n\
7082With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007083With optional end, stop comparing S at that position.\n\
7084prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject *
7087unicode_startswith(PyUnicodeObject *self,
7088 PyObject *args)
7089{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007092 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007093 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007097 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099 if (PyTuple_Check(subobj)) {
7100 Py_ssize_t i;
7101 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7102 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7103 PyTuple_GET_ITEM(subobj, i));
7104 if (substring == NULL)
7105 return NULL;
7106 result = tailmatch(self, substring, start, end, -1);
7107 Py_DECREF(substring);
7108 if (result) {
7109 Py_RETURN_TRUE;
7110 }
7111 }
7112 /* nothing matched */
7113 Py_RETURN_FALSE;
7114 }
7115 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 return NULL;
7118 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121}
7122
7123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007124PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007125"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007127Return True if S ends with the specified suffix, False otherwise.\n\
7128With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129With optional end, stop comparing S at that position.\n\
7130suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
7132static PyObject *
7133unicode_endswith(PyUnicodeObject *self,
7134 PyObject *args)
7135{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007138 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007139 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7143 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145 if (PyTuple_Check(subobj)) {
7146 Py_ssize_t i;
7147 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7148 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7149 PyTuple_GET_ITEM(subobj, i));
7150 if (substring == NULL)
7151 return NULL;
7152 result = tailmatch(self, substring, start, end, +1);
7153 Py_DECREF(substring);
7154 if (result) {
7155 Py_RETURN_TRUE;
7156 }
7157 }
7158 Py_RETURN_FALSE;
7159 }
7160 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167}
7168
7169
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007170
7171static PyObject *
7172unicode_getnewargs(PyUnicodeObject *v)
7173{
7174 return Py_BuildValue("(u#)", v->str, v->length);
7175}
7176
7177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178static PyMethodDef unicode_methods[] = {
7179
7180 /* Order is according to common usage: often used methods should
7181 appear first, since lookup is done sequentially. */
7182
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007183 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7184 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7185 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007186 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007187 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7188 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7189 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7190 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7191 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7192 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7193 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007194 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007195 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7196 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7197 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007199 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007200/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7201 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7202 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7203 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007204 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007205 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007206 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007207 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007208 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7209 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7210 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7211 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7212 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7213 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7214 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7215 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7216 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7217 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7218 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7219 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7220 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7221 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007222 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007223#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007224 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225#endif
7226
7227#if 0
7228 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007229 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230#endif
7231
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007232 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 {NULL, NULL}
7234};
7235
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007236static PyObject *
7237unicode_mod(PyObject *v, PyObject *w)
7238{
7239 if (!PyUnicode_Check(v)) {
7240 Py_INCREF(Py_NotImplemented);
7241 return Py_NotImplemented;
7242 }
7243 return PyUnicode_Format(v, w);
7244}
7245
7246static PyNumberMethods unicode_as_number = {
7247 0, /*nb_add*/
7248 0, /*nb_subtract*/
7249 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007250 unicode_mod, /*nb_remainder*/
7251};
7252
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007255 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7257 (ssizeargfunc) unicode_getitem, /* sq_item */
7258 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 0, /* sq_ass_item */
7260 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007261 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262};
7263
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007264static PyObject*
7265unicode_subscript(PyUnicodeObject* self, PyObject* item)
7266{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007267 if (PyIndex_Check(item)) {
7268 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007269 if (i == -1 && PyErr_Occurred())
7270 return NULL;
7271 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007272 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007273 return unicode_getitem(self, i);
7274 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007275 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007276 Py_UNICODE* source_buf;
7277 Py_UNICODE* result_buf;
7278 PyObject* result;
7279
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007280 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007281 &start, &stop, &step, &slicelength) < 0) {
7282 return NULL;
7283 }
7284
7285 if (slicelength <= 0) {
7286 return PyUnicode_FromUnicode(NULL, 0);
7287 } else {
7288 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007289 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7290 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007291
7292 if (result_buf == NULL)
7293 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007294
7295 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7296 result_buf[i] = source_buf[cur];
7297 }
Tim Petersced69f82003-09-16 20:30:58 +00007298
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007299 result = PyUnicode_FromUnicode(result_buf, slicelength);
7300 PyMem_FREE(result_buf);
7301 return result;
7302 }
7303 } else {
7304 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7305 return NULL;
7306 }
7307}
7308
7309static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007310 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007311 (binaryfunc)unicode_subscript, /* mp_subscript */
7312 (objobjargproc)0, /* mp_ass_subscript */
7313};
7314
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007317 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 const void **ptr)
7319{
7320 if (index != 0) {
7321 PyErr_SetString(PyExc_SystemError,
7322 "accessing non-existent unicode segment");
7323 return -1;
7324 }
7325 *ptr = (void *) self->str;
7326 return PyUnicode_GET_DATA_SIZE(self);
7327}
7328
Martin v. Löwis18e16552006-02-15 17:27:45 +00007329static Py_ssize_t
7330unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 const void **ptr)
7332{
7333 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007334 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 return -1;
7336}
7337
7338static int
7339unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341{
7342 if (lenp)
7343 *lenp = PyUnicode_GET_DATA_SIZE(self);
7344 return 1;
7345}
7346
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007347static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007349 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 const void **ptr)
7351{
7352 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007353
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 if (index != 0) {
7355 PyErr_SetString(PyExc_SystemError,
7356 "accessing non-existent unicode segment");
7357 return -1;
7358 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007359 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 if (str == NULL)
7361 return -1;
7362 *ptr = (void *) PyString_AS_STRING(str);
7363 return PyString_GET_SIZE(str);
7364}
7365
7366/* Helpers for PyUnicode_Format() */
7367
7368static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007371 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 if (argidx < arglen) {
7373 (*p_argidx)++;
7374 if (arglen < 0)
7375 return args;
7376 else
7377 return PyTuple_GetItem(args, argidx);
7378 }
7379 PyErr_SetString(PyExc_TypeError,
7380 "not enough arguments for format string");
7381 return NULL;
7382}
7383
/* Bit flags describing the modifiers of a format conversion.  Each flag
   occupies its own bit so they can be OR-ed together in one int.
   F_ALT selects the '#' alternate form in formatfloat()/formatint(); the
   other names presumably mirror the printf-style '-', '+', ' ' and '0'
   modifiers -- confirm against the parser in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7389
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007391strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007393 register Py_ssize_t i;
7394 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 for (i = len - 1; i >= 0; i--)
7396 buffer[i] = (Py_UNICODE) charbuffer[i];
7397
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398 return len;
7399}
7400
Neal Norwitzfc76d632006-01-10 06:03:13 +00007401static int
7402doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7403{
Tim Peters15231542006-02-16 01:08:01 +00007404 Py_ssize_t result;
7405
Neal Norwitzfc76d632006-01-10 06:03:13 +00007406 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007407 result = strtounicode(buffer, (char *)buffer);
7408 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007409}
7410
7411static int
7412longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7413{
Tim Peters15231542006-02-16 01:08:01 +00007414 Py_ssize_t result;
7415
Neal Norwitzfc76d632006-01-10 06:03:13 +00007416 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007417 result = strtounicode(buffer, (char *)buffer);
7418 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007419}
7420
Guido van Rossum078151d2002-08-11 04:24:12 +00007421/* XXX To save some code duplication, formatfloat/long/int could have been
7422 shared with stringobject.c, converting from 8-bit to Unicode after the
7423 formatting is done. */
7424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425static int
7426formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007427 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 int flags,
7429 int prec,
7430 int type,
7431 PyObject *v)
7432{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007433 /* fmt = '%#.' + `prec` + `type`
7434 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 char fmt[20];
7436 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 x = PyFloat_AsDouble(v);
7439 if (x == -1.0 && PyErr_Occurred())
7440 return -1;
7441 if (prec < 0)
7442 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7444 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007445 /* Worst case length calc to ensure no buffer overrun:
7446
7447 'g' formats:
7448 fmt = %#.<prec>g
7449 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7450 for any double rep.)
7451 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7452
7453 'f' formats:
7454 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7455 len = 1 + 50 + 1 + prec = 52 + prec
7456
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007457 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007458 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007459
7460 */
7461 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7462 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007463 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007464 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007465 return -1;
7466 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007467 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7468 (flags&F_ALT) ? "#" : "",
7469 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007470 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471}
7472
Tim Peters38fd5b62000-09-21 05:43:11 +00007473static PyObject*
7474formatlong(PyObject *val, int flags, int prec, int type)
7475{
7476 char *buf;
7477 int i, len;
7478 PyObject *str; /* temporary string object. */
7479 PyUnicodeObject *result;
7480
7481 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7482 if (!str)
7483 return NULL;
7484 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007485 if (!result) {
7486 Py_DECREF(str);
7487 return NULL;
7488 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007489 for (i = 0; i < len; i++)
7490 result->str[i] = buf[i];
7491 result->str[len] = 0;
7492 Py_DECREF(str);
7493 return (PyObject*)result;
7494}
7495
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496static int
7497formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007498 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 int flags,
7500 int prec,
7501 int type,
7502 PyObject *v)
7503{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007504 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007505 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7506 * + 1 + 1
7507 * = 24
7508 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007509 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007510 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 long x;
7512
7513 x = PyInt_AsLong(v);
7514 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007515 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007516 if (x < 0 && type == 'u') {
7517 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007518 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007519 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7520 sign = "-";
7521 else
7522 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007524 prec = 1;
7525
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007526 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7527 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007528 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007529 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007530 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007531 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007532 return -1;
7533 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007534
7535 if ((flags & F_ALT) &&
7536 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007537 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007538 * of issues that cause pain:
7539 * - when 0 is being converted, the C standard leaves off
7540 * the '0x' or '0X', which is inconsistent with other
7541 * %#x/%#X conversions and inconsistent with Python's
7542 * hex() function
7543 * - there are platforms that violate the standard and
7544 * convert 0 with the '0x' or '0X'
7545 * (Metrowerks, Compaq Tru64)
7546 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007547 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007548 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007549 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007550 * We can achieve the desired consistency by inserting our
7551 * own '0x' or '0X' prefix, and substituting %x/%X in place
7552 * of %#x/%#X.
7553 *
7554 * Note that this is the same approach as used in
7555 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007556 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007557 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7558 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007559 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007560 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007561 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7562 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007563 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007564 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007565 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007566 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007567 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007568 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569}
7570
7571static int
7572formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007573 size_t buflen,
7574 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007576 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007577 if (PyUnicode_Check(v)) {
7578 if (PyUnicode_GET_SIZE(v) != 1)
7579 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007583 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007584 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007585 goto onError;
7586 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589 else {
7590 /* Integer input truncated to a character */
7591 long x;
7592 x = PyInt_AsLong(v);
7593 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007594 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007595#ifdef Py_UNICODE_WIDE
7596 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007597 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007598 "%c arg not in range(0x110000) "
7599 "(wide Python build)");
7600 return -1;
7601 }
7602#else
7603 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007604 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007605 "%c arg not in range(0x10000) "
7606 "(narrow Python build)");
7607 return -1;
7608 }
7609#endif
7610 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 }
7612 buf[1] = '\0';
7613 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007614
7615 onError:
7616 PyErr_SetString(PyExc_TypeError,
7617 "%c requires int or char");
7618 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619}
7620
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
7630
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631PyObject *PyUnicode_Format(PyObject *format,
7632 PyObject *args)
7633{
7634 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007635 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 int args_owned = 0;
7637 PyUnicodeObject *result = NULL;
7638 PyObject *dict = NULL;
7639 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007640
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 if (format == NULL || args == NULL) {
7642 PyErr_BadInternalCall();
7643 return NULL;
7644 }
7645 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007646 if (uformat == NULL)
7647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 fmt = PyUnicode_AS_UNICODE(uformat);
7649 fmtcnt = PyUnicode_GET_SIZE(uformat);
7650
7651 reslen = rescnt = fmtcnt + 100;
7652 result = _PyUnicode_New(reslen);
7653 if (result == NULL)
7654 goto onError;
7655 res = PyUnicode_AS_UNICODE(result);
7656
7657 if (PyTuple_Check(args)) {
7658 arglen = PyTuple_Size(args);
7659 argidx = 0;
7660 }
7661 else {
7662 arglen = -1;
7663 argidx = -2;
7664 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007665 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7666 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 dict = args;
7668
7669 while (--fmtcnt >= 0) {
7670 if (*fmt != '%') {
7671 if (--rescnt < 0) {
7672 rescnt = fmtcnt + 100;
7673 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007674 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7677 --rescnt;
7678 }
7679 *res++ = *fmt++;
7680 }
7681 else {
7682 /* Got a format specifier */
7683 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007684 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 Py_UNICODE c = '\0';
7687 Py_UNICODE fill;
7688 PyObject *v = NULL;
7689 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007690 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007693 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694
7695 fmt++;
7696 if (*fmt == '(') {
7697 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007698 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 PyObject *key;
7700 int pcount = 1;
7701
7702 if (dict == NULL) {
7703 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007704 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 goto onError;
7706 }
7707 ++fmt;
7708 --fmtcnt;
7709 keystart = fmt;
7710 /* Skip over balanced parentheses */
7711 while (pcount > 0 && --fmtcnt >= 0) {
7712 if (*fmt == ')')
7713 --pcount;
7714 else if (*fmt == '(')
7715 ++pcount;
7716 fmt++;
7717 }
7718 keylen = fmt - keystart - 1;
7719 if (fmtcnt < 0 || pcount > 0) {
7720 PyErr_SetString(PyExc_ValueError,
7721 "incomplete format key");
7722 goto onError;
7723 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007724#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007725 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 then looked up since Python uses strings to hold
7727 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007728 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 key = PyUnicode_EncodeUTF8(keystart,
7730 keylen,
7731 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007732#else
7733 key = PyUnicode_FromUnicode(keystart, keylen);
7734#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 if (key == NULL)
7736 goto onError;
7737 if (args_owned) {
7738 Py_DECREF(args);
7739 args_owned = 0;
7740 }
7741 args = PyObject_GetItem(dict, key);
7742 Py_DECREF(key);
7743 if (args == NULL) {
7744 goto onError;
7745 }
7746 args_owned = 1;
7747 arglen = -1;
7748 argidx = -2;
7749 }
7750 while (--fmtcnt >= 0) {
7751 switch (c = *fmt++) {
7752 case '-': flags |= F_LJUST; continue;
7753 case '+': flags |= F_SIGN; continue;
7754 case ' ': flags |= F_BLANK; continue;
7755 case '#': flags |= F_ALT; continue;
7756 case '0': flags |= F_ZERO; continue;
7757 }
7758 break;
7759 }
7760 if (c == '*') {
7761 v = getnextarg(args, arglen, &argidx);
7762 if (v == NULL)
7763 goto onError;
7764 if (!PyInt_Check(v)) {
7765 PyErr_SetString(PyExc_TypeError,
7766 "* wants int");
7767 goto onError;
7768 }
7769 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007770 if (width == -1 && PyErr_Occurred())
7771 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 if (width < 0) {
7773 flags |= F_LJUST;
7774 width = -width;
7775 }
7776 if (--fmtcnt >= 0)
7777 c = *fmt++;
7778 }
7779 else if (c >= '0' && c <= '9') {
7780 width = c - '0';
7781 while (--fmtcnt >= 0) {
7782 c = *fmt++;
7783 if (c < '0' || c > '9')
7784 break;
7785 if ((width*10) / 10 != width) {
7786 PyErr_SetString(PyExc_ValueError,
7787 "width too big");
7788 goto onError;
7789 }
7790 width = width*10 + (c - '0');
7791 }
7792 }
7793 if (c == '.') {
7794 prec = 0;
7795 if (--fmtcnt >= 0)
7796 c = *fmt++;
7797 if (c == '*') {
7798 v = getnextarg(args, arglen, &argidx);
7799 if (v == NULL)
7800 goto onError;
7801 if (!PyInt_Check(v)) {
7802 PyErr_SetString(PyExc_TypeError,
7803 "* wants int");
7804 goto onError;
7805 }
7806 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007807 if (prec == -1 && PyErr_Occurred())
7808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 if (prec < 0)
7810 prec = 0;
7811 if (--fmtcnt >= 0)
7812 c = *fmt++;
7813 }
7814 else if (c >= '0' && c <= '9') {
7815 prec = c - '0';
7816 while (--fmtcnt >= 0) {
7817 c = Py_CHARMASK(*fmt++);
7818 if (c < '0' || c > '9')
7819 break;
7820 if ((prec*10) / 10 != prec) {
7821 PyErr_SetString(PyExc_ValueError,
7822 "prec too big");
7823 goto onError;
7824 }
7825 prec = prec*10 + (c - '0');
7826 }
7827 }
7828 } /* prec */
7829 if (fmtcnt >= 0) {
7830 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 if (--fmtcnt >= 0)
7832 c = *fmt++;
7833 }
7834 }
7835 if (fmtcnt < 0) {
7836 PyErr_SetString(PyExc_ValueError,
7837 "incomplete format");
7838 goto onError;
7839 }
7840 if (c != '%') {
7841 v = getnextarg(args, arglen, &argidx);
7842 if (v == NULL)
7843 goto onError;
7844 }
7845 sign = 0;
7846 fill = ' ';
7847 switch (c) {
7848
7849 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007850 pbuf = formatbuf;
7851 /* presume that buffer length is at least 1 */
7852 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 len = 1;
7854 break;
7855
7856 case 's':
7857 case 'r':
7858 if (PyUnicode_Check(v) && c == 's') {
7859 temp = v;
7860 Py_INCREF(temp);
7861 }
7862 else {
7863 PyObject *unicode;
7864 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007865 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 else
7867 temp = PyObject_Repr(v);
7868 if (temp == NULL)
7869 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007870 if (PyUnicode_Check(temp))
7871 /* nothing to do */;
7872 else if (PyString_Check(temp)) {
7873 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007874 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007876 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007878 Py_DECREF(temp);
7879 temp = unicode;
7880 if (temp == NULL)
7881 goto onError;
7882 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007883 else {
7884 Py_DECREF(temp);
7885 PyErr_SetString(PyExc_TypeError,
7886 "%s argument has non-string str()");
7887 goto onError;
7888 }
7889 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007890 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 len = PyUnicode_GET_SIZE(temp);
7892 if (prec >= 0 && len > prec)
7893 len = prec;
7894 break;
7895
7896 case 'i':
7897 case 'd':
7898 case 'u':
7899 case 'o':
7900 case 'x':
7901 case 'X':
7902 if (c == 'i')
7903 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007904 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007905 temp = formatlong(v, flags, prec, c);
7906 if (!temp)
7907 goto onError;
7908 pbuf = PyUnicode_AS_UNICODE(temp);
7909 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007910 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007912 else {
7913 pbuf = formatbuf;
7914 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7915 flags, prec, c, v);
7916 if (len < 0)
7917 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007918 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007919 }
7920 if (flags & F_ZERO)
7921 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922 break;
7923
7924 case 'e':
7925 case 'E':
7926 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007927 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 case 'g':
7929 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007930 if (c == 'F')
7931 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007932 pbuf = formatbuf;
7933 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7934 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 if (len < 0)
7936 goto onError;
7937 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007938 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 fill = '0';
7940 break;
7941
7942 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007943 pbuf = formatbuf;
7944 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 if (len < 0)
7946 goto onError;
7947 break;
7948
7949 default:
7950 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007951 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007952 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007953 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007954 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007955 (Py_ssize_t)(fmt - 1 -
7956 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 goto onError;
7958 }
7959 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007960 if (*pbuf == '-' || *pbuf == '+') {
7961 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 len--;
7963 }
7964 else if (flags & F_SIGN)
7965 sign = '+';
7966 else if (flags & F_BLANK)
7967 sign = ' ';
7968 else
7969 sign = 0;
7970 }
7971 if (width < len)
7972 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007973 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 reslen -= rescnt;
7975 rescnt = width + fmtcnt + 100;
7976 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007977 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007978 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007979 PyErr_NoMemory();
7980 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007981 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007982 if (_PyUnicode_Resize(&result, reslen) < 0) {
7983 Py_XDECREF(temp);
7984 goto onError;
7985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 res = PyUnicode_AS_UNICODE(result)
7987 + reslen - rescnt;
7988 }
7989 if (sign) {
7990 if (fill != ' ')
7991 *res++ = sign;
7992 rescnt--;
7993 if (width > len)
7994 width--;
7995 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007996 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7997 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007998 assert(pbuf[1] == c);
7999 if (fill != ' ') {
8000 *res++ = *pbuf++;
8001 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008002 }
Tim Petersfff53252001-04-12 18:38:48 +00008003 rescnt -= 2;
8004 width -= 2;
8005 if (width < 0)
8006 width = 0;
8007 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 if (width > len && !(flags & F_LJUST)) {
8010 do {
8011 --rescnt;
8012 *res++ = fill;
8013 } while (--width > len);
8014 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008015 if (fill == ' ') {
8016 if (sign)
8017 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008018 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008019 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008020 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008021 *res++ = *pbuf++;
8022 *res++ = *pbuf++;
8023 }
8024 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008025 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 res += len;
8027 rescnt -= len;
8028 while (--width >= len) {
8029 --rescnt;
8030 *res++ = ' ';
8031 }
8032 if (dict && (argidx < arglen) && c != '%') {
8033 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008034 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008035 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 goto onError;
8037 }
8038 Py_XDECREF(temp);
8039 } /* '%' */
8040 } /* until end */
8041 if (argidx < arglen && !dict) {
8042 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008043 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 goto onError;
8045 }
8046
Thomas Woutersa96affe2006-03-12 00:29:36 +00008047 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8048 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 if (args_owned) {
8050 Py_DECREF(args);
8051 }
8052 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 return (PyObject *)result;
8054
8055 onError:
8056 Py_XDECREF(result);
8057 Py_DECREF(uformat);
8058 if (args_owned) {
8059 Py_DECREF(args);
8060 }
8061 return NULL;
8062}
8063
8064static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008065 (readbufferproc) unicode_buffer_getreadbuf,
8066 (writebufferproc) unicode_buffer_getwritebuf,
8067 (segcountproc) unicode_buffer_getsegcount,
8068 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069};
8070
Jeremy Hylton938ace62002-07-17 16:30:39 +00008071static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008072unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8073
Tim Peters6d6c1a32001-08-02 04:15:00 +00008074static PyObject *
8075unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8076{
8077 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008078 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008079 char *encoding = NULL;
8080 char *errors = NULL;
8081
Guido van Rossume023fe02001-08-30 03:12:59 +00008082 if (type != &PyUnicode_Type)
8083 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008084 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8085 kwlist, &x, &encoding, &errors))
8086 return NULL;
8087 if (x == NULL)
8088 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008089 if (encoding == NULL && errors == NULL)
8090 return PyObject_Unicode(x);
8091 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008092 return PyUnicode_FromEncodedObject(x, encoding, errors);
8093}
8094
Guido van Rossume023fe02001-08-30 03:12:59 +00008095static PyObject *
8096unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8097{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008098 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008100
8101 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8102 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8103 if (tmp == NULL)
8104 return NULL;
8105 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008106 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008107 if (pnew == NULL) {
8108 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008109 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008110 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008111 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8112 if (pnew->str == NULL) {
8113 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008114 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008115 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008116 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008117 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008118 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8119 pnew->length = n;
8120 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008121 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008122 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008123}
8124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008125PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008126"unicode(string [, encoding[, errors]]) -> object\n\
8127\n\
8128Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008129encoding defaults to the current default string encoding.\n\
8130errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008131
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008132static PyObject *unicode_iter(PyObject *seq);
8133
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134PyTypeObject PyUnicode_Type = {
8135 PyObject_HEAD_INIT(&PyType_Type)
8136 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008137 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 sizeof(PyUnicodeObject), /* tp_size */
8139 0, /* tp_itemsize */
8140 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008141 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008143 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008145 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008146 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008147 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008149 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 (hashfunc) unicode_hash, /* tp_hash*/
8151 0, /* tp_call*/
8152 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008153 PyObject_GenericGetAttr, /* tp_getattro */
8154 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008156 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8157 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008158 unicode_doc, /* tp_doc */
8159 0, /* tp_traverse */
8160 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008161 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008162 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008163 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008164 0, /* tp_iternext */
8165 unicode_methods, /* tp_methods */
8166 0, /* tp_members */
8167 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008168 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008169 0, /* tp_dict */
8170 0, /* tp_descr_get */
8171 0, /* tp_descr_set */
8172 0, /* tp_dictoffset */
8173 0, /* tp_init */
8174 0, /* tp_alloc */
8175 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008176 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177};
8178
8179/* Initialize the Unicode implementation */
8180
Thomas Wouters78890102000-07-22 19:25:51 +00008181void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008183 int i;
8184
Thomas Wouters477c8d52006-05-27 19:21:47 +00008185 /* XXX - move this array to unicodectype.c ? */
8186 Py_UNICODE linebreak[] = {
8187 0x000A, /* LINE FEED */
8188 0x000D, /* CARRIAGE RETURN */
8189 0x001C, /* FILE SEPARATOR */
8190 0x001D, /* GROUP SEPARATOR */
8191 0x001E, /* RECORD SEPARATOR */
8192 0x0085, /* NEXT LINE */
8193 0x2028, /* LINE SEPARATOR */
8194 0x2029, /* PARAGRAPH SEPARATOR */
8195 };
8196
Fred Drakee4315f52000-05-09 19:53:39 +00008197 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008198 unicode_freelist = NULL;
8199 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008201 if (!unicode_empty)
8202 return;
8203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008204 for (i = 0; i < 256; i++)
8205 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008206 if (PyType_Ready(&PyUnicode_Type) < 0)
8207 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208
8209 /* initialize the linebreak bloom filter */
8210 bloom_linebreak = make_bloom_mask(
8211 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8212 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008213
8214 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215}
8216
8217/* Finalize the Unicode implementation */
8218
8219void
Thomas Wouters78890102000-07-22 19:25:51 +00008220_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008222 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008223 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008225 Py_XDECREF(unicode_empty);
8226 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008228 for (i = 0; i < 256; i++) {
8229 if (unicode_latin1[i]) {
8230 Py_DECREF(unicode_latin1[i]);
8231 unicode_latin1[i] = NULL;
8232 }
8233 }
8234
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008235 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 PyUnicodeObject *v = u;
8237 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008238 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008239 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008240 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008241 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008243 unicode_freelist = NULL;
8244 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008246
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008247
8248
8249/********************* Unicode Iterator **************************/
8250
8251typedef struct {
8252 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008253 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008254 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8255} unicodeiterobject;
8256
8257static void
8258unicodeiter_dealloc(unicodeiterobject *it)
8259{
8260 _PyObject_GC_UNTRACK(it);
8261 Py_XDECREF(it->it_seq);
8262 PyObject_GC_Del(it);
8263}
8264
8265static int
8266unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8267{
8268 Py_VISIT(it->it_seq);
8269 return 0;
8270}
8271
8272static PyObject *
8273unicodeiter_next(unicodeiterobject *it)
8274{
8275 PyUnicodeObject *seq;
8276 PyObject *item;
8277
8278 assert(it != NULL);
8279 seq = it->it_seq;
8280 if (seq == NULL)
8281 return NULL;
8282 assert(PyUnicode_Check(seq));
8283
8284 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008285 item = PyUnicode_FromUnicode(
8286 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008287 if (item != NULL)
8288 ++it->it_index;
8289 return item;
8290 }
8291
8292 Py_DECREF(seq);
8293 it->it_seq = NULL;
8294 return NULL;
8295}
8296
8297static PyObject *
8298unicodeiter_len(unicodeiterobject *it)
8299{
8300 Py_ssize_t len = 0;
8301 if (it->it_seq)
8302 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8303 return PyInt_FromSsize_t(len);
8304}
8305
8306PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8307
8308static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008309 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8310 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008311 {NULL, NULL} /* sentinel */
8312};
8313
8314PyTypeObject PyUnicodeIter_Type = {
8315 PyObject_HEAD_INIT(&PyType_Type)
8316 0, /* ob_size */
8317 "unicodeiterator", /* tp_name */
8318 sizeof(unicodeiterobject), /* tp_basicsize */
8319 0, /* tp_itemsize */
8320 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008321 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008322 0, /* tp_print */
8323 0, /* tp_getattr */
8324 0, /* tp_setattr */
8325 0, /* tp_compare */
8326 0, /* tp_repr */
8327 0, /* tp_as_number */
8328 0, /* tp_as_sequence */
8329 0, /* tp_as_mapping */
8330 0, /* tp_hash */
8331 0, /* tp_call */
8332 0, /* tp_str */
8333 PyObject_GenericGetAttr, /* tp_getattro */
8334 0, /* tp_setattro */
8335 0, /* tp_as_buffer */
8336 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8337 0, /* tp_doc */
8338 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8339 0, /* tp_clear */
8340 0, /* tp_richcompare */
8341 0, /* tp_weaklistoffset */
8342 PyObject_SelfIter, /* tp_iter */
8343 (iternextfunc)unicodeiter_next, /* tp_iternext */
8344 unicodeiter_methods, /* tp_methods */
8345 0,
8346};
8347
8348static PyObject *
8349unicode_iter(PyObject *seq)
8350{
8351 unicodeiterobject *it;
8352
8353 if (!PyUnicode_Check(seq)) {
8354 PyErr_BadInternalCall();
8355 return NULL;
8356 }
8357 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8358 if (it == NULL)
8359 return NULL;
8360 it->it_index = 0;
8361 Py_INCREF(seq);
8362 it->it_seq = (PyUnicodeObject *)seq;
8363 _PyObject_GC_TRACK(it);
8364 return (PyObject *)it;
8365}
8366
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008367#ifdef __cplusplus
8368}
8369#endif
8370
8371
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008372/*
8373Local variables:
8374c-basic-offset: 4
8375indent-tabs-mode: nil
8376End:
8377*/