blob: 61069804abefa40935014af6e306a88bd81afe2b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once this many are parked on the list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the object's length field, i.e. it
   is a count of characters, not of bytes.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects: a singly linked list of recycled
   objects (the link is stored in the first pointer-sized slot of each
   object) together with the number of objects currently on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are indexed by character value and stay
   NULL until the corresponding character is first requested. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is fixed to "utf-8".  Always use the
   PyUnicode_GetDefaultEncoding() API to access this global. */
static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple a single bitmask is used, and the low 5 bits of each
   Unicode character select the bit index (0..31).  A clear bit proves
   that the character is not in the set; a set bit only means "maybe",
   so a hit must always be confirmed by an exact check. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.

   The shifted constant is 1UL rather than a plain int 1: shifting a
   signed int 1 left by 31 overflows, which is undefined behaviour.
   Both arguments are parenthesized so that arbitrary expressions can
   be passed without precedence surprises. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom rejection first, exact linebreak test only on a hit.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True if chr is one of the setlen characters in set.  The bloom mask
   is consulted first so that most non-members are rejected without
   scanning the set.

   The whole expansion is parenthesized so that the macro composes
   correctly with surrounding operators (e.g. !BLOOM_MEMBER(...) or
   a || BLOOM_MEMBER(...)).  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize() after casting unicodevar to the
   PyObject ** that function expects. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
430 unicode = _PyUnicode_New(size);
431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937/* --- UTF-7 Codec -------------------------------------------------------- */
938
939/* see RFC2152 for details */
940
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7f */

};
959
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c has to be base64-encoded: anything outside 1..127, the
   always-special characters, and optionally whitespace / Set O. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  The base64 alphabet is pure
   ASCII, so test explicit ranges rather than isalnum(): that function is
   locale dependent and undefined for values outside the unsigned char
   range, which Py_UNICODE arguments can easily exceed. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as ch holds at least 6 unread
   bits; bits is decremented accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit characters to out for as long as ch holds at least 16
   unread bits.  Expects `errmsg` and a `utf7Error` label in the
   expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high */       \
            /* surrogate so let's not bother seeing if the low */       \
            /* surrogate is correct or not */                           \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't */      \
            /* represent it in a 16-bit character */                    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
/* The UTF-7 helper macros are private to the codec above. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242/* --- UTF-8 Codec -------------------------------------------------------- */
1243
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is
   only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xa0 - 0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xb0 - 0xbf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xc0 - 0xcf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xd0 - 0xdf */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xe0 - 0xef */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xf0 - 0xff */
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
2075/* Return a Unicode-Escape string version of the Unicode object.
2076
2077 If quotes is true, the string is enclosed in u"" or u'' quotes as
2078 appropriate.
2079
2080*/
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Walter Dörwald79e913e2007-05-12 11:08:06 +00002097static const char *hexdigits = "0123456789abcdef";
2098
2099PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Thomas Wouters89f507f2006-12-13 04:49:30 +00002105 /* XXX(nnorwitz): rather than over-allocating, it would be
2106 better to choose a different scheme. Perhaps scan the
2107 first N-chars of the string and allocate based on that size.
2108 */
2109 /* Initial allocation is based on the longest-possible unichr
2110 escape.
2111
2112 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2113 unichr, so in this case it's the longest unichr escape. In
2114 narrow (UTF-16) builds this is five chars per source unichr
2115 since there are two unichrs in the surrogate pair, so in narrow
2116 (UTF-16) builds it's not the longest unichr escape.
2117
2118 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2119 so in the narrow (UTF-16) build case it's the longest unichr
2120 escape.
2121 */
2122
Walter Dörwald79e913e2007-05-12 11:08:06 +00002123 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002124#ifdef Py_UNICODE_WIDE
2125 + 10*size
2126#else
2127 + 6*size
2128#endif
2129 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 if (repr == NULL)
2131 return NULL;
2132
Walter Dörwald79e913e2007-05-12 11:08:06 +00002133 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 while (size-- > 0) {
2136 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Walter Dörwald79e913e2007-05-12 11:08:06 +00002138 /* Escape backslashes */
2139 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 *p++ = '\\';
2141 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002142 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002145#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002146 /* Map 21-bit characters to '\U00xxxxxx' */
2147 else if (ch >= 0x10000) {
2148 *p++ = '\\';
2149 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002150 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2151 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2152 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2153 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2154 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2155 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2156 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2157 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002159 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002160#else
2161 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002162 else if (ch >= 0xD800 && ch < 0xDC00) {
2163 Py_UNICODE ch2;
2164 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002165
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002166 ch2 = *s++;
2167 size--;
2168 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2169 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2170 *p++ = '\\';
2171 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002172 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2173 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2174 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2175 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2176 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2177 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2178 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2179 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002180 continue;
2181 }
2182 /* Fall through: isolated surrogates are copied as-is */
2183 s--;
2184 size++;
2185 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002186#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002189 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 *p++ = '\\';
2191 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002192 *p++ = hexdigits[(ch >> 12) & 0x000F];
2193 *p++ = hexdigits[(ch >> 8) & 0x000F];
2194 *p++ = hexdigits[(ch >> 4) & 0x000F];
2195 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002197
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002198 /* Map special whitespace to '\t', \n', '\r' */
2199 else if (ch == '\t') {
2200 *p++ = '\\';
2201 *p++ = 't';
2202 }
2203 else if (ch == '\n') {
2204 *p++ = '\\';
2205 *p++ = 'n';
2206 }
2207 else if (ch == '\r') {
2208 *p++ = '\\';
2209 *p++ = 'r';
2210 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002211
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002212 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002213 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002215 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002216 *p++ = hexdigits[(ch >> 4) & 0x000F];
2217 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 /* Copy everything else as-is */
2221 else
2222 *p++ = (char) ch;
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002226 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2227 Py_DECREF(repr);
2228 return NULL;
2229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return repr;
2231}
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2234{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002235 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002240 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode));
2242
2243 if (!s)
2244 return NULL;
2245 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2246 PyBytes_GET_SIZE(s));
2247 Py_DECREF(s);
2248 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249}
2250
2251/* --- Raw Unicode Escape Codec ------------------------------------------- */
2252
2253PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002254 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 const char *errors)
2256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002258 Py_ssize_t startinpos;
2259 Py_ssize_t endinpos;
2260 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 const char *end;
2264 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 PyObject *errorHandler = NULL;
2266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002267
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 /* Escaped strings will always be longer than the resulting
2269 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 length after conversion to the true value. (But decoding error
2271 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 end = s + size;
2279 while (s < end) {
2280 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002281 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002283 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 /* Non-escape characters are interpreted as Unicode ordinals */
2286 if (*s != '\\') {
2287 *p++ = (unsigned char)*s++;
2288 continue;
2289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
2292 /* \u-escapes are only interpreted iff the number of leading
2293 backslashes if odd */
2294 bs = s;
2295 for (;s < end;) {
2296 if (*s != '\\')
2297 break;
2298 *p++ = (unsigned char)*s++;
2299 }
2300 if (((s - bs) & 1) == 0 ||
2301 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002302 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 continue;
2304 }
2305 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002306 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 s++;
2308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002314 endinpos = s-starts;
2315 if (unicode_decode_call_errorhandler(
2316 errors, &errorHandler,
2317 "rawunicodeescape", "truncated \\uXXXX",
2318 starts, size, &startinpos, &endinpos, &exc, &s,
2319 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002321 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
2323 x = (x<<4) & ~0xF;
2324 if (c >= '0' && c <= '9')
2325 x += c - '0';
2326 else if (c >= 'a' && c <= 'f')
2327 x += 10 + c - 'a';
2328 else
2329 x += 10 + c - 'A';
2330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002331#ifndef Py_UNICODE_WIDE
2332 if (x > 0x10000) {
2333 if (unicode_decode_call_errorhandler(
2334 errors, &errorHandler,
2335 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2336 starts, size, &startinpos, &endinpos, &exc, &s,
2337 (PyObject **)&v, &outpos, &p))
2338 goto onError;
2339 }
2340#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 *p++ = x;
2342 nextByte:
2343 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002345 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 onError:
2352 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 return NULL;
2356}
2357
2358PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360{
2361 PyObject *repr;
2362 char *p;
2363 char *q;
2364
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002365 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002367#ifdef Py_UNICODE_WIDE
2368 repr = PyString_FromStringAndSize(NULL, 10 * size);
2369#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002371#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 if (repr == NULL)
2373 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002374 if (size == 0)
2375 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376
2377 p = q = PyString_AS_STRING(repr);
2378 while (size-- > 0) {
2379 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002380#ifdef Py_UNICODE_WIDE
2381 /* Map 32-bit characters to '\Uxxxxxxxx' */
2382 if (ch >= 0x10000) {
2383 *p++ = '\\';
2384 *p++ = 'U';
2385 *p++ = hexdigit[(ch >> 28) & 0xf];
2386 *p++ = hexdigit[(ch >> 24) & 0xf];
2387 *p++ = hexdigit[(ch >> 20) & 0xf];
2388 *p++ = hexdigit[(ch >> 16) & 0xf];
2389 *p++ = hexdigit[(ch >> 12) & 0xf];
2390 *p++ = hexdigit[(ch >> 8) & 0xf];
2391 *p++ = hexdigit[(ch >> 4) & 0xf];
2392 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002393 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002394 else
2395#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 /* Map 16-bit characters to '\uxxxx' */
2397 if (ch >= 256) {
2398 *p++ = '\\';
2399 *p++ = 'u';
2400 *p++ = hexdigit[(ch >> 12) & 0xf];
2401 *p++ = hexdigit[(ch >> 8) & 0xf];
2402 *p++ = hexdigit[(ch >> 4) & 0xf];
2403 *p++ = hexdigit[ch & 15];
2404 }
2405 /* Copy everything else as-is */
2406 else
2407 *p++ = (char) ch;
2408 }
2409 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002410 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 return repr;
2412}
2413
2414PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2415{
2416 if (!PyUnicode_Check(unicode)) {
2417 PyErr_BadArgument();
2418 return NULL;
2419 }
2420 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2421 PyUnicode_GET_SIZE(unicode));
2422}
2423
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002424/* --- Unicode Internal Codec ------------------------------------------- */
2425
2426PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002427 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002428 const char *errors)
2429{
2430 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002431 Py_ssize_t startinpos;
2432 Py_ssize_t endinpos;
2433 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002434 PyUnicodeObject *v;
2435 Py_UNICODE *p;
2436 const char *end;
2437 const char *reason;
2438 PyObject *errorHandler = NULL;
2439 PyObject *exc = NULL;
2440
Neal Norwitzd43069c2006-01-08 01:12:10 +00002441#ifdef Py_UNICODE_WIDE
2442 Py_UNICODE unimax = PyUnicode_GetMax();
2443#endif
2444
Thomas Wouters89f507f2006-12-13 04:49:30 +00002445 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002446 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2447 if (v == NULL)
2448 goto onError;
2449 if (PyUnicode_GetSize((PyObject *)v) == 0)
2450 return (PyObject *)v;
2451 p = PyUnicode_AS_UNICODE(v);
2452 end = s + size;
2453
2454 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002455 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002456 /* We have to sanity check the raw data, otherwise doom looms for
2457 some malformed UCS-4 data. */
2458 if (
2459 #ifdef Py_UNICODE_WIDE
2460 *p > unimax || *p < 0 ||
2461 #endif
2462 end-s < Py_UNICODE_SIZE
2463 )
2464 {
2465 startinpos = s - starts;
2466 if (end-s < Py_UNICODE_SIZE) {
2467 endinpos = end-starts;
2468 reason = "truncated input";
2469 }
2470 else {
2471 endinpos = s - starts + Py_UNICODE_SIZE;
2472 reason = "illegal code point (> 0x10FFFF)";
2473 }
2474 outpos = p - PyUnicode_AS_UNICODE(v);
2475 if (unicode_decode_call_errorhandler(
2476 errors, &errorHandler,
2477 "unicode_internal", reason,
2478 starts, size, &startinpos, &endinpos, &exc, &s,
2479 (PyObject **)&v, &outpos, &p)) {
2480 goto onError;
2481 }
2482 }
2483 else {
2484 p++;
2485 s += Py_UNICODE_SIZE;
2486 }
2487 }
2488
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002489 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002490 goto onError;
2491 Py_XDECREF(errorHandler);
2492 Py_XDECREF(exc);
2493 return (PyObject *)v;
2494
2495 onError:
2496 Py_XDECREF(v);
2497 Py_XDECREF(errorHandler);
2498 Py_XDECREF(exc);
2499 return NULL;
2500}
2501
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502/* --- Latin-1 Codec ------------------------------------------------------ */
2503
2504PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 const char *errors)
2507{
2508 PyUnicodeObject *v;
2509 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002512 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002513 Py_UNICODE r = *(unsigned char*)s;
2514 return PyUnicode_FromUnicode(&r, 1);
2515 }
2516
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 v = _PyUnicode_New(size);
2518 if (v == NULL)
2519 goto onError;
2520 if (size == 0)
2521 return (PyObject *)v;
2522 p = PyUnicode_AS_UNICODE(v);
2523 while (size-- > 0)
2524 *p++ = (unsigned char)*s++;
2525 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 onError:
2528 Py_XDECREF(v);
2529 return NULL;
2530}
2531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532/* create or adjust a UnicodeEncodeError */
2533static void make_encode_exception(PyObject **exceptionObject,
2534 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 const Py_UNICODE *unicode, Py_ssize_t size,
2536 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 if (*exceptionObject == NULL) {
2540 *exceptionObject = PyUnicodeEncodeError_Create(
2541 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 }
2543 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2545 goto onError;
2546 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2547 goto onError;
2548 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2549 goto onError;
2550 return;
2551 onError:
2552 Py_DECREF(*exceptionObject);
2553 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 }
2555}
2556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557/* raises a UnicodeEncodeError */
2558static void raise_encode_exception(PyObject **exceptionObject,
2559 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 const Py_UNICODE *unicode, Py_ssize_t size,
2561 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 const char *reason)
2563{
2564 make_encode_exception(exceptionObject,
2565 encoding, unicode, size, startpos, endpos, reason);
2566 if (*exceptionObject != NULL)
2567 PyCodec_StrictErrors(*exceptionObject);
2568}
2569
2570/* error handling callback helper:
2571 build arguments, call the callback and check the arguments,
2572 put the result into newpos and return the replacement string, which
2573 has to be freed by the caller */
2574static PyObject *unicode_encode_call_errorhandler(const char *errors,
2575 PyObject **errorHandler,
2576 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002577 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2578 Py_ssize_t startpos, Py_ssize_t endpos,
2579 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582
2583 PyObject *restuple;
2584 PyObject *resunicode;
2585
2586 if (*errorHandler == NULL) {
2587 *errorHandler = PyCodec_LookupError(errors);
2588 if (*errorHandler == NULL)
2589 return NULL;
2590 }
2591
2592 make_encode_exception(exceptionObject,
2593 encoding, unicode, size, startpos, endpos, reason);
2594 if (*exceptionObject == NULL)
2595 return NULL;
2596
2597 restuple = PyObject_CallFunctionObjArgs(
2598 *errorHandler, *exceptionObject, NULL);
2599 if (restuple == NULL)
2600 return NULL;
2601 if (!PyTuple_Check(restuple)) {
2602 PyErr_Format(PyExc_TypeError, &argparse[4]);
2603 Py_DECREF(restuple);
2604 return NULL;
2605 }
2606 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2607 &resunicode, newpos)) {
2608 Py_DECREF(restuple);
2609 return NULL;
2610 }
2611 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002612 *newpos = size+*newpos;
2613 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002614 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002615 Py_DECREF(restuple);
2616 return NULL;
2617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 Py_INCREF(resunicode);
2619 Py_DECREF(restuple);
2620 return resunicode;
2621}
2622
2623static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002624 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 const char *errors,
2626 int limit)
2627{
2628 /* output object */
2629 PyObject *res;
2630 /* pointers to the beginning and end+1 of input */
2631 const Py_UNICODE *startp = p;
2632 const Py_UNICODE *endp = p + size;
2633 /* pointer to the beginning of the unencodable characters */
2634 /* const Py_UNICODE *badp = NULL; */
2635 /* pointer into the output */
2636 char *str;
2637 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002638 Py_ssize_t respos = 0;
2639 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002640 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2641 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 PyObject *errorHandler = NULL;
2643 PyObject *exc = NULL;
2644 /* the following variable is used for caching string comparisons
2645 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2646 int known_errorHandler = -1;
2647
2648 /* allocate enough for a simple encoding without
2649 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002650 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 if (res == NULL)
2652 goto onError;
2653 if (size == 0)
2654 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002655 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 ressize = size;
2657
2658 while (p<endp) {
2659 Py_UNICODE c = *p;
2660
2661 /* can we encode this? */
2662 if (c<limit) {
2663 /* no overflow check, because we know that the space is enough */
2664 *str++ = (char)c;
2665 ++p;
2666 }
2667 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002668 Py_ssize_t unicodepos = p-startp;
2669 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002671 Py_ssize_t repsize;
2672 Py_ssize_t newpos;
2673 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 Py_UNICODE *uni2;
2675 /* startpos for collecting unencodable chars */
2676 const Py_UNICODE *collstart = p;
2677 const Py_UNICODE *collend = p;
2678 /* find all unecodable characters */
2679 while ((collend < endp) && ((*collend)>=limit))
2680 ++collend;
2681 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2682 if (known_errorHandler==-1) {
2683 if ((errors==NULL) || (!strcmp(errors, "strict")))
2684 known_errorHandler = 1;
2685 else if (!strcmp(errors, "replace"))
2686 known_errorHandler = 2;
2687 else if (!strcmp(errors, "ignore"))
2688 known_errorHandler = 3;
2689 else if (!strcmp(errors, "xmlcharrefreplace"))
2690 known_errorHandler = 4;
2691 else
2692 known_errorHandler = 0;
2693 }
2694 switch (known_errorHandler) {
2695 case 1: /* strict */
2696 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2697 goto onError;
2698 case 2: /* replace */
2699 while (collstart++<collend)
2700 *str++ = '?'; /* fall through */
2701 case 3: /* ignore */
2702 p = collend;
2703 break;
2704 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002705 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 /* determine replacement size (temporarily (mis)uses p) */
2707 for (p = collstart, repsize = 0; p < collend; ++p) {
2708 if (*p<10)
2709 repsize += 2+1+1;
2710 else if (*p<100)
2711 repsize += 2+2+1;
2712 else if (*p<1000)
2713 repsize += 2+3+1;
2714 else if (*p<10000)
2715 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002716#ifndef Py_UNICODE_WIDE
2717 else
2718 repsize += 2+5+1;
2719#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 else if (*p<100000)
2721 repsize += 2+5+1;
2722 else if (*p<1000000)
2723 repsize += 2+6+1;
2724 else
2725 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002726#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 }
2728 requiredsize = respos+repsize+(endp-collend);
2729 if (requiredsize > ressize) {
2730 if (requiredsize<2*ressize)
2731 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002732 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002734 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 ressize = requiredsize;
2736 }
2737 /* generate replacement (temporarily (mis)uses p) */
2738 for (p = collstart; p < collend; ++p) {
2739 str += sprintf(str, "&#%d;", (int)*p);
2740 }
2741 p = collend;
2742 break;
2743 default:
2744 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2745 encoding, reason, startp, size, &exc,
2746 collstart-startp, collend-startp, &newpos);
2747 if (repunicode == NULL)
2748 goto onError;
2749 /* need more space? (at least enough for what we
2750 have+the replacement+the rest of the string, so
2751 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002752 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 repsize = PyUnicode_GET_SIZE(repunicode);
2754 requiredsize = respos+repsize+(endp-collend);
2755 if (requiredsize > ressize) {
2756 if (requiredsize<2*ressize)
2757 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002758 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 Py_DECREF(repunicode);
2760 goto onError;
2761 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002762 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 ressize = requiredsize;
2764 }
2765 /* check if there is anything unencodable in the replacement
2766 and copy it to the output */
2767 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2768 c = *uni2;
2769 if (c >= limit) {
2770 raise_encode_exception(&exc, encoding, startp, size,
2771 unicodepos, unicodepos+1, reason);
2772 Py_DECREF(repunicode);
2773 goto onError;
2774 }
2775 *str = (char)c;
2776 }
2777 p = startp + newpos;
2778 Py_DECREF(repunicode);
2779 }
2780 }
2781 }
2782 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002783 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 if (respos<ressize)
2785 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002786 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 Py_XDECREF(errorHandler);
2788 Py_XDECREF(exc);
2789 return res;
2790
2791 onError:
2792 Py_XDECREF(res);
2793 Py_XDECREF(errorHandler);
2794 Py_XDECREF(exc);
2795 return NULL;
2796}
2797
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002799 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 const char *errors)
2801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803}
2804
2805PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2806{
2807 if (!PyUnicode_Check(unicode)) {
2808 PyErr_BadArgument();
2809 return NULL;
2810 }
2811 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2812 PyUnicode_GET_SIZE(unicode),
2813 NULL);
2814}
2815
2816/* --- 7-bit ASCII Codec -------------------------------------------------- */
2817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002819 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 const char *errors)
2821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 PyUnicodeObject *v;
2824 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002825 Py_ssize_t startinpos;
2826 Py_ssize_t endinpos;
2827 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 const char *e;
2829 PyObject *errorHandler = NULL;
2830 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002831
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002833 if (size == 1 && *(unsigned char*)s < 128) {
2834 Py_UNICODE r = *(unsigned char*)s;
2835 return PyUnicode_FromUnicode(&r, 1);
2836 }
Tim Petersced69f82003-09-16 20:30:58 +00002837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 v = _PyUnicode_New(size);
2839 if (v == NULL)
2840 goto onError;
2841 if (size == 0)
2842 return (PyObject *)v;
2843 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 e = s + size;
2845 while (s < e) {
2846 register unsigned char c = (unsigned char)*s;
2847 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 ++s;
2850 }
2851 else {
2852 startinpos = s-starts;
2853 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002854 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 if (unicode_decode_call_errorhandler(
2856 errors, &errorHandler,
2857 "ascii", "ordinal not in range(128)",
2858 starts, size, &startinpos, &endinpos, &exc, &s,
2859 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002863 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002864 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 Py_XDECREF(errorHandler);
2867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002869
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 onError:
2871 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 Py_XDECREF(errorHandler);
2873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 return NULL;
2875}
2876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002878 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 const char *errors)
2880{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882}
2883
2884PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2885{
2886 if (!PyUnicode_Check(unicode)) {
2887 PyErr_BadArgument();
2888 return NULL;
2889 }
2890 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2891 PyUnicode_GET_SIZE(unicode),
2892 NULL);
2893}
2894
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002895#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002896
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002897/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002898
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002899#if SIZEOF_INT < SIZEOF_SSIZE_T
2900#define NEED_RETRY
2901#endif
2902
2903/* XXX This code is limited to "true" double-byte encodings, as
2904 a) it assumes an incomplete character consists of a single byte, and
2905 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2906 encodings, see IsDBCSLeadByteEx documentation. */
2907
/* Decide whether the byte at s[offset] acts as a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   character before it is located with CharPrev() and 1 is returned if
   there is no previous character, if the previous character does not
   start with a lead byte, or if it starts exactly two bytes earlier;
   in the remaining case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return cur - before == 2;
}
2918
2919/*
2920 * Decode MBCS string into unicode object. If 'final' is set, converts
2921 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2922 */
2923static int decode_mbcs(PyUnicodeObject **v,
2924 const char *s, /* MBCS string */
2925 int size, /* sizeof MBCS string */
2926 int final)
2927{
2928 Py_UNICODE *p;
2929 Py_ssize_t n = 0;
2930 int usize = 0;
2931
2932 assert(size >= 0);
2933
2934 /* Skip trailing lead-byte unless 'final' is set */
2935 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2936 --size;
2937
2938 /* First get the size of the result */
2939 if (size > 0) {
2940 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2941 if (usize == 0) {
2942 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2943 return -1;
2944 }
2945 }
2946
2947 if (*v == NULL) {
2948 /* Create unicode object */
2949 *v = _PyUnicode_New(usize);
2950 if (*v == NULL)
2951 return -1;
2952 }
2953 else {
2954 /* Extend unicode object */
2955 n = PyUnicode_GET_SIZE(*v);
2956 if (_PyUnicode_Resize(v, n + usize) < 0)
2957 return -1;
2958 }
2959
2960 /* Do the conversion */
2961 if (size > 0) {
2962 p = PyUnicode_AS_UNICODE(*v) + n;
2963 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2964 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2965 return -1;
2966 }
2967 }
2968
2969 return size;
2970}
2971
2972PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2973 Py_ssize_t size,
2974 const char *errors,
2975 Py_ssize_t *consumed)
2976{
2977 PyUnicodeObject *v = NULL;
2978 int done;
2979
2980 if (consumed)
2981 *consumed = 0;
2982
2983#ifdef NEED_RETRY
2984 retry:
2985 if (size > INT_MAX)
2986 done = decode_mbcs(&v, s, INT_MAX, 0);
2987 else
2988#endif
2989 done = decode_mbcs(&v, s, (int)size, !consumed);
2990
2991 if (done < 0) {
2992 Py_XDECREF(v);
2993 return NULL;
2994 }
2995
2996 if (consumed)
2997 *consumed += done;
2998
2999#ifdef NEED_RETRY
3000 if (size > INT_MAX) {
3001 s += done;
3002 size -= done;
3003 goto retry;
3004 }
3005#endif
3006
3007 return (PyObject *)v;
3008}
3009
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003010PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003011 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003012 const char *errors)
3013{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003014 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3015}
3016
3017/*
3018 * Convert unicode into string object (MBCS).
3019 * Returns 0 if succeed, -1 otherwise.
3020 */
3021static int encode_mbcs(PyObject **repr,
3022 const Py_UNICODE *p, /* unicode */
3023 int size) /* size of unicode */
3024{
3025 int mbcssize = 0;
3026 Py_ssize_t n = 0;
3027
3028 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003029
3030 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003031 if (size > 0) {
3032 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3033 if (mbcssize == 0) {
3034 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3035 return -1;
3036 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003037 }
3038
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003039 if (*repr == NULL) {
3040 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003041 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003042 if (*repr == NULL)
3043 return -1;
3044 }
3045 else {
3046 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003047 n = PyBytes_Size(*repr);
3048 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003049 return -1;
3050 }
3051
3052 /* Do the conversion */
3053 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003054 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003055 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3056 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3057 return -1;
3058 }
3059 }
3060
3061 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003062}
3063
3064PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003065 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003066 const char *errors)
3067{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003068 PyObject *repr = NULL;
3069 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003070
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003071#ifdef NEED_RETRY
3072 retry:
3073 if (size > INT_MAX)
3074 ret = encode_mbcs(&repr, p, INT_MAX);
3075 else
3076#endif
3077 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003078
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003079 if (ret < 0) {
3080 Py_XDECREF(repr);
3081 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003082 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003083
3084#ifdef NEED_RETRY
3085 if (size > INT_MAX) {
3086 p += INT_MAX;
3087 size -= INT_MAX;
3088 goto retry;
3089 }
3090#endif
3091
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003092 return repr;
3093}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003094
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003095PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3096{
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 return NULL;
3100 }
3101 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3102 PyUnicode_GET_SIZE(unicode),
3103 NULL);
3104}
3105
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003106#undef NEED_RETRY
3107
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003108#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003109
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110/* --- Character Mapping Codec -------------------------------------------- */
3111
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003113 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 PyObject *mapping,
3115 const char *errors)
3116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 Py_ssize_t startinpos;
3119 Py_ssize_t endinpos;
3120 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003121 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 PyUnicodeObject *v;
3123 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003124 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 PyObject *errorHandler = NULL;
3126 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003127 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003128 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003129
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 /* Default to Latin-1 */
3131 if (mapping == NULL)
3132 return PyUnicode_DecodeLatin1(s, size, errors);
3133
3134 v = _PyUnicode_New(size);
3135 if (v == NULL)
3136 goto onError;
3137 if (size == 0)
3138 return (PyObject *)v;
3139 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003141 if (PyUnicode_CheckExact(mapping)) {
3142 mapstring = PyUnicode_AS_UNICODE(mapping);
3143 maplen = PyUnicode_GET_SIZE(mapping);
3144 while (s < e) {
3145 unsigned char ch = *s;
3146 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003148 if (ch < maplen)
3149 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003151 if (x == 0xfffe) {
3152 /* undefined mapping */
3153 outpos = p-PyUnicode_AS_UNICODE(v);
3154 startinpos = s-starts;
3155 endinpos = startinpos+1;
3156 if (unicode_decode_call_errorhandler(
3157 errors, &errorHandler,
3158 "charmap", "character maps to <undefined>",
3159 starts, size, &startinpos, &endinpos, &exc, &s,
3160 (PyObject **)&v, &outpos, &p)) {
3161 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003162 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003163 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003164 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003165 *p++ = x;
3166 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003168 }
3169 else {
3170 while (s < e) {
3171 unsigned char ch = *s;
3172 PyObject *w, *x;
3173
3174 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3175 w = PyInt_FromLong((long)ch);
3176 if (w == NULL)
3177 goto onError;
3178 x = PyObject_GetItem(mapping, w);
3179 Py_DECREF(w);
3180 if (x == NULL) {
3181 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3182 /* No mapping found means: mapping is undefined. */
3183 PyErr_Clear();
3184 x = Py_None;
3185 Py_INCREF(x);
3186 } else
3187 goto onError;
3188 }
3189
3190 /* Apply mapping */
3191 if (PyInt_Check(x)) {
3192 long value = PyInt_AS_LONG(x);
3193 if (value < 0 || value > 65535) {
3194 PyErr_SetString(PyExc_TypeError,
3195 "character mapping must be in range(65536)");
3196 Py_DECREF(x);
3197 goto onError;
3198 }
3199 *p++ = (Py_UNICODE)value;
3200 }
3201 else if (x == Py_None) {
3202 /* undefined mapping */
3203 outpos = p-PyUnicode_AS_UNICODE(v);
3204 startinpos = s-starts;
3205 endinpos = startinpos+1;
3206 if (unicode_decode_call_errorhandler(
3207 errors, &errorHandler,
3208 "charmap", "character maps to <undefined>",
3209 starts, size, &startinpos, &endinpos, &exc, &s,
3210 (PyObject **)&v, &outpos, &p)) {
3211 Py_DECREF(x);
3212 goto onError;
3213 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003214 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003215 continue;
3216 }
3217 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003218 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003219
3220 if (targetsize == 1)
3221 /* 1-1 mapping */
3222 *p++ = *PyUnicode_AS_UNICODE(x);
3223
3224 else if (targetsize > 1) {
3225 /* 1-n mapping */
3226 if (targetsize > extrachars) {
3227 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003228 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3229 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003230 (targetsize << 2);
3231 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003232 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003233 if (_PyUnicode_Resize(&v,
3234 PyUnicode_GET_SIZE(v) + needed) < 0) {
3235 Py_DECREF(x);
3236 goto onError;
3237 }
3238 p = PyUnicode_AS_UNICODE(v) + oldpos;
3239 }
3240 Py_UNICODE_COPY(p,
3241 PyUnicode_AS_UNICODE(x),
3242 targetsize);
3243 p += targetsize;
3244 extrachars -= targetsize;
3245 }
3246 /* 1-0 mapping: skip the character */
3247 }
3248 else {
3249 /* wrong return value */
3250 PyErr_SetString(PyExc_TypeError,
3251 "character mapping must return integer, None or unicode");
3252 Py_DECREF(x);
3253 goto onError;
3254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003256 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
3259 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003260 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003265
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 Py_XDECREF(v);
3270 return NULL;
3271}
3272
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003273/* Charmap encoding: the lookup table */
3274
/* Three-level trie mapping a BMP code point to an output byte.
   It is filled by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup(). */
struct encoding_map{
  PyObject_HEAD
  /* indexed by (c >> 11); holds a level-2 block number, or 0xFF when no
     character in that range is mapped */
  unsigned char level1[32];
  /* number of 16-byte level-2 blocks and 128-byte level-3 blocks stored
     in level23 */
  int count2, count3;
  /* variable-length tail: 16*count2 bytes of level-2 data followed by
     128*count3 bytes of level-3 data.  Declared with one element; the
     allocation in PyUnicode_BuildEncodingMap() adds the rest and subtracts
     this one byte. */
  unsigned char level23[1];
};
3281
3282static PyObject*
3283encoding_map_size(PyObject *obj, PyObject* args)
3284{
3285 struct encoding_map *map = (struct encoding_map*)obj;
3286 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3287 128*map->count3);
3288}
3289
/* Method table for EncodingMapType: a single no-argument method "size"
   implemented by encoding_map_size(). */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }   /* sentinel */
};
3295
/* tp_dealloc for EncodingMapType: releases the object's memory with
   PyObject_FREE(), the counterpart of the PyObject_MALLOC() call in
   PyUnicode_BuildEncodingMap(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
3301
/* Type object for the trie produced by PyUnicode_BuildEncodingMap().
   Only the name, basic size, deallocator, default flags and the method
   table are filled in; every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
3346
/* Build the reverse (encoding) table for a 256-character decoding table.

   string must be a unicode object of length exactly 256, where string[b]
   is the character that byte b decodes to and 0xFFFE marks an unmapped
   byte.  Otherwise PyErr_BadArgument() is raised and NULL is returned.

   The result is either
     - an EncodingMap object (a compact three-level trie), or
     - a dict mapping each character ordinal to its byte value, used when
       the table cannot be represented by the trie (see need_dict below).

   Returns a new reference, or NULL with an exception set. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* scratch tables used only to count how many level-2 / level-3 blocks
       the trie will need; 0xFF means "block not allocated yet" */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
        ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        /* top 5 bits select the level-1 slot, top 9 bits the level-2 slot */
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* block numbers are stored in one byte and 0xFF is the "empty" marker */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict {character ordinal: byte value}.  Note that
           all 256 entries are inserted, including those equal to 0xFFFE. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* the "- 1" accounts for the one-byte level23 placeholder in the struct */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);   /* 0xFF: no level-3 block yet */
    memset(mlevel3, 0, 128*count3);     /* 0: character not encodable */
    /* level-3 block numbers are handed out again while filling the trie */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
3453
3454static int
3455encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3456{
3457 struct encoding_map *map = (struct encoding_map*)mapping;
3458 int l1 = c>>11;
3459 int l2 = (c>>7) & 0xF;
3460 int l3 = c & 0x7F;
3461 int i;
3462
3463#ifdef Py_UNICODE_WIDE
3464 if (c > 0xFFFF) {
3465 return -1;
3466 }
3467#endif
3468 if (c == 0)
3469 return 0;
3470 /* level 1*/
3471 i = map->level1[l1];
3472 if (i == 0xFF) {
3473 return -1;
3474 }
3475 /* level 2*/
3476 i = map->level23[16*i+l2];
3477 if (i == 0xFF) {
3478 return -1;
3479 }
3480 /* level 3 */
3481 i = map->level23[16*map->count2 + 128*i + l3];
3482 if (i == 0) {
3483 return -1;
3484 }
3485 return i;
3486}
3487
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488/* Lookup the character ch in the mapping. If the character
3489 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003490 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 PyObject *w = PyInt_FromLong((long)c);
3494 PyObject *x;
3495
3496 if (w == NULL)
3497 return NULL;
3498 x = PyObject_GetItem(mapping, w);
3499 Py_DECREF(w);
3500 if (x == NULL) {
3501 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3502 /* No mapping found means: mapping is undefined. */
3503 PyErr_Clear();
3504 x = Py_None;
3505 Py_INCREF(x);
3506 return x;
3507 } else
3508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003510 else if (x == Py_None)
3511 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 else if (PyInt_Check(x)) {
3513 long value = PyInt_AS_LONG(x);
3514 if (value < 0 || value > 255) {
3515 PyErr_SetString(PyExc_TypeError,
3516 "character mapping must be in range(256)");
3517 Py_DECREF(x);
3518 return NULL;
3519 }
3520 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 else if (PyString_Check(x))
3523 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003526 PyErr_Format(PyExc_TypeError,
3527 "character mapping must return integer, None or str8, not %.400s",
3528 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 Py_DECREF(x);
3530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 }
3532}
3533
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003534static int
3535charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3536{
3537 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3538 /* exponentially overallocate to minimize reallocations */
3539 if (requiredsize < 2*outsize)
3540 requiredsize = 2*outsize;
3541 if (_PyString_Resize(outobj, requiredsize)) {
3542 return 0;
3543 }
3544 return 1;
3545}
3546
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output;
   enc_FAILED    - the mapping has no entry for the character;
   enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
  enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
/* Look up the character c in mapping, append its encoding to the output
   string *outobj at position *outpos and advance *outpos.  The output
   string is reallocated (via charmapencode_resize) if not enough space is
   available.

   Returns enc_SUCCESS when something was written (or the replacement was
   an empty string), enc_FAILED when the mapping is undefined for c (in
   which case nothing is written), and enc_EXCEPTION when the lookup or the
   reallocation failed.  No object reference is handed back to the
   caller. */
static
charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
    PyObject **outobj, Py_ssize_t *outpos)
{
    PyObject *rep;
    char *outstart;
    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);

    /* Fast path: EncodingMap trie, always exactly one output byte. */
    if (mapping->ob_type == &EncodingMapType) {
        int res = encoding_map_lookup(c, mapping);
        Py_ssize_t requiredsize = *outpos+1;
        if (res == -1)
            return enc_FAILED;
        if (outsize<requiredsize)
            if (!charmapencode_resize(outobj, outpos, requiredsize))
                return enc_EXCEPTION;
        /* fetch the buffer only after a possible resize */
        outstart = PyString_AS_STRING(*outobj);
        outstart[(*outpos)++] = (char)res;
        return enc_SUCCESS;
    }

    /* Generic path: rep is a new reference to an int, a string or None. */
    rep = charmapencode_lookup(c, mapping);
    if (rep==NULL)
        return enc_EXCEPTION;
    else if (rep==Py_None) {
        Py_DECREF(rep);
        return enc_FAILED;
    } else {
        if (PyInt_Check(rep)) {
            /* single byte given as an integer */
            Py_ssize_t requiredsize = *outpos+1;
            if (outsize<requiredsize)
                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
            outstart = PyString_AS_STRING(*outobj);
            outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
        }
        else {
            /* string replacement of arbitrary length */
            const char *repchars = PyString_AS_STRING(rep);
            Py_ssize_t repsize = PyString_GET_SIZE(rep);
            Py_ssize_t requiredsize = *outpos+repsize;
            if (outsize<requiredsize)
                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
            outstart = PyString_AS_STRING(*outobj);
            memcpy(outstart + *outpos, repchars, repsize);
            *outpos += repsize;
        }
    }
    Py_DECREF(rep);
    return enc_SUCCESS;
}
3611
3612/* handle an error in PyUnicode_EncodeCharmap
3613 Return 0 on success, -1 on error */
3614static
3615int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003618 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003619 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620{
3621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003622 Py_ssize_t repsize;
3623 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 Py_UNICODE *uni2;
3625 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 Py_ssize_t collstartpos = *inpos;
3627 Py_ssize_t collendpos = *inpos+1;
3628 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 char *encoding = "charmap";
3630 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003631 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 /* find all unencodable characters */
3634 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003635 PyObject *rep;
3636 if (mapping->ob_type == &EncodingMapType) {
3637 int res = encoding_map_lookup(p[collendpos], mapping);
3638 if (res != -1)
3639 break;
3640 ++collendpos;
3641 continue;
3642 }
3643
3644 rep = charmapencode_lookup(p[collendpos], mapping);
3645 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003647 else if (rep!=Py_None) {
3648 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 break;
3650 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003651 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 ++collendpos;
3653 }
3654 /* cache callback name lookup
3655 * (if not done yet, i.e. it's the first error) */
3656 if (*known_errorHandler==-1) {
3657 if ((errors==NULL) || (!strcmp(errors, "strict")))
3658 *known_errorHandler = 1;
3659 else if (!strcmp(errors, "replace"))
3660 *known_errorHandler = 2;
3661 else if (!strcmp(errors, "ignore"))
3662 *known_errorHandler = 3;
3663 else if (!strcmp(errors, "xmlcharrefreplace"))
3664 *known_errorHandler = 4;
3665 else
3666 *known_errorHandler = 0;
3667 }
3668 switch (*known_errorHandler) {
3669 case 1: /* strict */
3670 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3671 return -1;
3672 case 2: /* replace */
3673 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3674 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003675 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 return -1;
3677 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003678 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3680 return -1;
3681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 }
3683 /* fall through */
3684 case 3: /* ignore */
3685 *inpos = collendpos;
3686 break;
3687 case 4: /* xmlcharrefreplace */
3688 /* generate replacement (temporarily (mis)uses p) */
3689 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3690 char buffer[2+29+1+1];
3691 char *cp;
3692 sprintf(buffer, "&#%d;", (int)p[collpos]);
3693 for (cp = buffer; *cp; ++cp) {
3694 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003697 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3699 return -1;
3700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 }
3702 }
3703 *inpos = collendpos;
3704 break;
3705 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003706 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 encoding, reason, p, size, exceptionObject,
3708 collstartpos, collendpos, &newpos);
3709 if (repunicode == NULL)
3710 return -1;
3711 /* generate replacement */
3712 repsize = PyUnicode_GET_SIZE(repunicode);
3713 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3714 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 return -1;
3717 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003718 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3721 return -1;
3722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 }
3724 *inpos = newpos;
3725 Py_DECREF(repunicode);
3726 }
3727 return 0;
3728}
3729
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003731 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 PyObject *mapping,
3733 const char *errors)
3734{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 /* output object */
3736 PyObject *res = NULL;
3737 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003740 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 PyObject *errorHandler = NULL;
3742 PyObject *exc = NULL;
3743 /* the following variable is used for caching string comparisons
3744 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3745 * 3=ignore, 4=xmlcharrefreplace */
3746 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747
3748 /* Default to Latin-1 */
3749 if (mapping == NULL)
3750 return PyUnicode_EncodeLatin1(p, size, errors);
3751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 /* allocate enough for a simple encoding without
3753 replacements, if we need more, we'll resize */
3754 res = PyString_FromStringAndSize(NULL, size);
3755 if (res == NULL)
3756 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003757 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 while (inpos<size) {
3761 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003762 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3763 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003765 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 if (charmap_encoding_error(p, size, &inpos, mapping,
3767 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003768 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003769 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003770 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 else
3774 /* done with this character => adjust input position */
3775 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 /* Resize if we allocated to much */
3779 if (respos<PyString_GET_SIZE(res)) {
3780 if (_PyString_Resize(&res, respos))
3781 goto onError;
3782 }
3783 Py_XDECREF(exc);
3784 Py_XDECREF(errorHandler);
3785 return res;
3786
3787 onError:
3788 Py_XDECREF(res);
3789 Py_XDECREF(exc);
3790 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 return NULL;
3792}
3793
3794PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3795 PyObject *mapping)
3796{
3797 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3798 PyErr_BadArgument();
3799 return NULL;
3800 }
3801 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3802 PyUnicode_GET_SIZE(unicode),
3803 mapping,
3804 NULL);
3805}
3806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807/* create or adjust a UnicodeTranslateError */
3808static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003809 const Py_UNICODE *unicode, Py_ssize_t size,
3810 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 if (*exceptionObject == NULL) {
3814 *exceptionObject = PyUnicodeTranslateError_Create(
3815 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
3817 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3819 goto onError;
3820 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3821 goto onError;
3822 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3823 goto onError;
3824 return;
3825 onError:
3826 Py_DECREF(*exceptionObject);
3827 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
3829}
3830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831/* raises a UnicodeTranslateError */
3832static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 const Py_UNICODE *unicode, Py_ssize_t size,
3834 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 const char *reason)
3836{
3837 make_translate_exception(exceptionObject,
3838 unicode, size, startpos, endpos, reason);
3839 if (*exceptionObject != NULL)
3840 PyCodec_StrictErrors(*exceptionObject);
3841}
3842
3843/* error handling callback helper:
3844 build arguments, call the callback and check the arguments,
3845 put the result into newpos and return the replacement string, which
3846 has to be freed by the caller */
3847static PyObject *unicode_translate_call_errorhandler(const char *errors,
3848 PyObject **errorHandler,
3849 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003850 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3851 Py_ssize_t startpos, Py_ssize_t endpos,
3852 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003854 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003856 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 PyObject *restuple;
3858 PyObject *resunicode;
3859
3860 if (*errorHandler == NULL) {
3861 *errorHandler = PyCodec_LookupError(errors);
3862 if (*errorHandler == NULL)
3863 return NULL;
3864 }
3865
3866 make_translate_exception(exceptionObject,
3867 unicode, size, startpos, endpos, reason);
3868 if (*exceptionObject == NULL)
3869 return NULL;
3870
3871 restuple = PyObject_CallFunctionObjArgs(
3872 *errorHandler, *exceptionObject, NULL);
3873 if (restuple == NULL)
3874 return NULL;
3875 if (!PyTuple_Check(restuple)) {
3876 PyErr_Format(PyExc_TypeError, &argparse[4]);
3877 Py_DECREF(restuple);
3878 return NULL;
3879 }
3880 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003881 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 Py_DECREF(restuple);
3883 return NULL;
3884 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003885 if (i_newpos<0)
3886 *newpos = size+i_newpos;
3887 else
3888 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003889 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003890 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003891 Py_DECREF(restuple);
3892 return NULL;
3893 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 Py_INCREF(resunicode);
3895 Py_DECREF(restuple);
3896 return resunicode;
3897}
3898
3899/* Lookup the character ch in the mapping and put the result in result,
3900 which must be decrefed by the caller.
3901 Return 0 on success, -1 on error */
3902static
3903int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3904{
3905 PyObject *w = PyInt_FromLong((long)c);
3906 PyObject *x;
3907
3908 if (w == NULL)
3909 return -1;
3910 x = PyObject_GetItem(mapping, w);
3911 Py_DECREF(w);
3912 if (x == NULL) {
3913 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3914 /* No mapping found means: use 1:1 mapping. */
3915 PyErr_Clear();
3916 *result = NULL;
3917 return 0;
3918 } else
3919 return -1;
3920 }
3921 else if (x == Py_None) {
3922 *result = x;
3923 return 0;
3924 }
3925 else if (PyInt_Check(x)) {
3926 long value = PyInt_AS_LONG(x);
3927 long max = PyUnicode_GetMax();
3928 if (value < 0 || value > max) {
3929 PyErr_Format(PyExc_TypeError,
3930 "character mapping must be in range(0x%lx)", max+1);
3931 Py_DECREF(x);
3932 return -1;
3933 }
3934 *result = x;
3935 return 0;
3936 }
3937 else if (PyUnicode_Check(x)) {
3938 *result = x;
3939 return 0;
3940 }
3941 else {
3942 /* wrong return value */
3943 PyErr_SetString(PyExc_TypeError,
3944 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003945 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 return -1;
3947 }
3948}
3949/* ensure that *outobj is at least requiredsize characters long,
3950if not reallocate and adjust various state variables.
3951Return 0 on success, -1 on error */
3952static
Walter Dörwald4894c302003-10-24 14:25:28 +00003953int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003957 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003961 if (requiredsize < 2 * oldsize)
3962 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003963 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 return -1;
3965 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 }
3967 return 0;
3968}
3969/* lookup the character, put the result in the output string and adjust
3970 various state variables. Return a new reference to the object that
3971 was put in the output buffer in *result, or Py_None, if the mapping was
3972 undefined (in which case no character was written).
3973 The called must decref result.
3974 Return 0 on success, -1 on error. */
3975static
Walter Dörwald4894c302003-10-24 14:25:28 +00003976int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003977 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003978 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979{
Walter Dörwald4894c302003-10-24 14:25:28 +00003980 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 return -1;
3982 if (*res==NULL) {
3983 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 }
3986 else if (*res==Py_None)
3987 ;
3988 else if (PyInt_Check(*res)) {
3989 /* no overflow check, because we know that the space is enough */
3990 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3991 }
3992 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003993 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 if (repsize==1) {
3995 /* no overflow check, because we know that the space is enough */
3996 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3997 }
3998 else if (repsize!=0) {
3999 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004000 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004001 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004002 repsize - 1;
4003 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 return -1;
4005 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4006 *outp += repsize;
4007 }
4008 }
4009 else
4010 return -1;
4011 return 0;
4012}
4013
4014PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 PyObject *mapping,
4017 const char *errors)
4018{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 /* output object */
4020 PyObject *res = NULL;
4021 /* pointers to the beginning and end+1 of input */
4022 const Py_UNICODE *startp = p;
4023 const Py_UNICODE *endp = p + size;
4024 /* pointer into the output */
4025 Py_UNICODE *str;
4026 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004027 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 char *reason = "character maps to <undefined>";
4029 PyObject *errorHandler = NULL;
4030 PyObject *exc = NULL;
4031 /* the following variable is used for caching string comparisons
4032 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4033 * 3=ignore, 4=xmlcharrefreplace */
4034 int known_errorHandler = -1;
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 if (mapping == NULL) {
4037 PyErr_BadArgument();
4038 return NULL;
4039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040
4041 /* allocate enough for a simple 1:1 translation without
4042 replacements, if we need more, we'll resize */
4043 res = PyUnicode_FromUnicode(NULL, size);
4044 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 return res;
4048 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 while (p<endp) {
4051 /* try to encode it */
4052 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004053 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 goto onError;
4056 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004057 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 if (x!=Py_None) /* it worked => adjust input pointer */
4059 ++p;
4060 else { /* untranslatable character */
4061 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t repsize;
4063 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 Py_UNICODE *uni2;
4065 /* startpos for collecting untranslatable chars */
4066 const Py_UNICODE *collstart = p;
4067 const Py_UNICODE *collend = p+1;
4068 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 /* find all untranslatable characters */
4071 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004072 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 goto onError;
4074 Py_XDECREF(x);
4075 if (x!=Py_None)
4076 break;
4077 ++collend;
4078 }
4079 /* cache callback name lookup
4080 * (if not done yet, i.e. it's the first error) */
4081 if (known_errorHandler==-1) {
4082 if ((errors==NULL) || (!strcmp(errors, "strict")))
4083 known_errorHandler = 1;
4084 else if (!strcmp(errors, "replace"))
4085 known_errorHandler = 2;
4086 else if (!strcmp(errors, "ignore"))
4087 known_errorHandler = 3;
4088 else if (!strcmp(errors, "xmlcharrefreplace"))
4089 known_errorHandler = 4;
4090 else
4091 known_errorHandler = 0;
4092 }
4093 switch (known_errorHandler) {
4094 case 1: /* strict */
4095 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4096 goto onError;
4097 case 2: /* replace */
4098 /* No need to check for space, this is a 1:1 replacement */
4099 for (coll = collstart; coll<collend; ++coll)
4100 *str++ = '?';
4101 /* fall through */
4102 case 3: /* ignore */
4103 p = collend;
4104 break;
4105 case 4: /* xmlcharrefreplace */
4106 /* generate replacement (temporarily (mis)uses p) */
4107 for (p = collstart; p < collend; ++p) {
4108 char buffer[2+29+1+1];
4109 char *cp;
4110 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004111 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4113 goto onError;
4114 for (cp = buffer; *cp; ++cp)
4115 *str++ = *cp;
4116 }
4117 p = collend;
4118 break;
4119 default:
4120 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4121 reason, startp, size, &exc,
4122 collstart-startp, collend-startp, &newpos);
4123 if (repunicode == NULL)
4124 goto onError;
4125 /* generate replacement */
4126 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004127 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4129 Py_DECREF(repunicode);
4130 goto onError;
4131 }
4132 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4133 *str++ = *uni2;
4134 p = startp + newpos;
4135 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 }
4137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 /* Resize if we allocated to much */
4140 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004141 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004142 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 }
4145 Py_XDECREF(exc);
4146 Py_XDECREF(errorHandler);
4147 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 onError:
4150 Py_XDECREF(res);
4151 Py_XDECREF(exc);
4152 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 return NULL;
4154}
4155
4156PyObject *PyUnicode_Translate(PyObject *str,
4157 PyObject *mapping,
4158 const char *errors)
4159{
4160 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 str = PyUnicode_FromObject(str);
4163 if (str == NULL)
4164 goto onError;
4165 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4166 PyUnicode_GET_SIZE(str),
4167 mapping,
4168 errors);
4169 Py_DECREF(str);
4170 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 onError:
4173 Py_XDECREF(str);
4174 return NULL;
4175}
Tim Petersced69f82003-09-16 20:30:58 +00004176
Guido van Rossum9e896b32000-04-05 20:11:21 +00004177/* --- Decimal Encoder ---------------------------------------------------- */
4178
4179int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004180 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004181 char *output,
4182 const char *errors)
4183{
4184 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 PyObject *errorHandler = NULL;
4186 PyObject *exc = NULL;
4187 const char *encoding = "decimal";
4188 const char *reason = "invalid decimal Unicode string";
4189 /* the following variable is used for caching string comparisons
4190 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4191 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004192
4193 if (output == NULL) {
4194 PyErr_BadArgument();
4195 return -1;
4196 }
4197
4198 p = s;
4199 end = s + length;
4200 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004202 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004204 Py_ssize_t repsize;
4205 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 Py_UNICODE *uni2;
4207 Py_UNICODE *collstart;
4208 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004209
Guido van Rossum9e896b32000-04-05 20:11:21 +00004210 if (Py_UNICODE_ISSPACE(ch)) {
4211 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004213 continue;
4214 }
4215 decimal = Py_UNICODE_TODECIMAL(ch);
4216 if (decimal >= 0) {
4217 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004219 continue;
4220 }
Guido van Rossumba477042000-04-06 18:18:10 +00004221 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004222 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004224 continue;
4225 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 /* All other characters are considered unencodable */
4227 collstart = p;
4228 collend = p+1;
4229 while (collend < end) {
4230 if ((0 < *collend && *collend < 256) ||
4231 !Py_UNICODE_ISSPACE(*collend) ||
4232 Py_UNICODE_TODECIMAL(*collend))
4233 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 /* cache callback name lookup
4236 * (if not done yet, i.e. it's the first error) */
4237 if (known_errorHandler==-1) {
4238 if ((errors==NULL) || (!strcmp(errors, "strict")))
4239 known_errorHandler = 1;
4240 else if (!strcmp(errors, "replace"))
4241 known_errorHandler = 2;
4242 else if (!strcmp(errors, "ignore"))
4243 known_errorHandler = 3;
4244 else if (!strcmp(errors, "xmlcharrefreplace"))
4245 known_errorHandler = 4;
4246 else
4247 known_errorHandler = 0;
4248 }
4249 switch (known_errorHandler) {
4250 case 1: /* strict */
4251 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4252 goto onError;
4253 case 2: /* replace */
4254 for (p = collstart; p < collend; ++p)
4255 *output++ = '?';
4256 /* fall through */
4257 case 3: /* ignore */
4258 p = collend;
4259 break;
4260 case 4: /* xmlcharrefreplace */
4261 /* generate replacement (temporarily (mis)uses p) */
4262 for (p = collstart; p < collend; ++p)
4263 output += sprintf(output, "&#%d;", (int)*p);
4264 p = collend;
4265 break;
4266 default:
4267 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4268 encoding, reason, s, length, &exc,
4269 collstart-s, collend-s, &newpos);
4270 if (repunicode == NULL)
4271 goto onError;
4272 /* generate replacement */
4273 repsize = PyUnicode_GET_SIZE(repunicode);
4274 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4275 Py_UNICODE ch = *uni2;
4276 if (Py_UNICODE_ISSPACE(ch))
4277 *output++ = ' ';
4278 else {
4279 decimal = Py_UNICODE_TODECIMAL(ch);
4280 if (decimal >= 0)
4281 *output++ = '0' + decimal;
4282 else if (0 < ch && ch < 256)
4283 *output++ = (char)ch;
4284 else {
4285 Py_DECREF(repunicode);
4286 raise_encode_exception(&exc, encoding,
4287 s, length, collstart-s, collend-s, reason);
4288 goto onError;
4289 }
4290 }
4291 }
4292 p = s + newpos;
4293 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004294 }
4295 }
4296 /* 0-terminate the output string */
4297 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 Py_XDECREF(exc);
4299 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004300 return 0;
4301
4302 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 Py_XDECREF(exc);
4304 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004305 return -1;
4306}
4307
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308/* --- Helpers ------------------------------------------------------------ */
4309
Thomas Wouters477c8d52006-05-27 19:21:47 +00004310#define STRINGLIB_CHAR Py_UNICODE
4311
4312#define STRINGLIB_LEN PyUnicode_GET_SIZE
4313#define STRINGLIB_NEW PyUnicode_FromUnicode
4314#define STRINGLIB_STR PyUnicode_AS_UNICODE
4315
4316Py_LOCAL_INLINE(int)
4317STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004319 if (str[0] != other[0])
4320 return 1;
4321 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
Thomas Wouters477c8d52006-05-27 19:21:47 +00004324#define STRINGLIB_EMPTY unicode_empty
4325
4326#include "stringlib/fastsearch.h"
4327
4328#include "stringlib/count.h"
4329#include "stringlib/find.h"
4330#include "stringlib/partition.h"
4331
/* Helper macro to fix up start/end slice values against (obj)->length.

   It operates on two variables named start and end that must be in
   scope where the macro is used.  Negative values are taken relative to
   the length, start is clipped to be >= 0 and end to 0..length.

   obj is evaluated several times and therefore must not have side
   effects.  The body is wrapped in do { } while (0) so that the macro
   expands to exactly one statement and is safe inside an unbraced
   if/else; use it as  FIX_START_END(obj);  */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4344
Martin v. Löwis18e16552006-02-15 17:27:45 +00004345Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004346 PyObject *substr,
4347 Py_ssize_t start,
4348 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004351 PyUnicodeObject* str_obj;
4352 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004353
Thomas Wouters477c8d52006-05-27 19:21:47 +00004354 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4355 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004357 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4358 if (!sub_obj) {
4359 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 return -1;
4361 }
Tim Petersced69f82003-09-16 20:30:58 +00004362
Thomas Wouters477c8d52006-05-27 19:21:47 +00004363 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004364
Thomas Wouters477c8d52006-05-27 19:21:47 +00004365 result = stringlib_count(
4366 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4367 );
4368
4369 Py_DECREF(sub_obj);
4370 Py_DECREF(str_obj);
4371
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 return result;
4373}
4374
Martin v. Löwis18e16552006-02-15 17:27:45 +00004375Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004376 PyObject *sub,
4377 Py_ssize_t start,
4378 Py_ssize_t end,
4379 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004384 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004385 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004386 sub = PyUnicode_FromObject(sub);
4387 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004388 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004389 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 }
Tim Petersced69f82003-09-16 20:30:58 +00004391
Thomas Wouters477c8d52006-05-27 19:21:47 +00004392 if (direction > 0)
4393 result = stringlib_find_slice(
4394 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4395 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4396 start, end
4397 );
4398 else
4399 result = stringlib_rfind_slice(
4400 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4401 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4402 start, end
4403 );
4404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004406 Py_DECREF(sub);
4407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 return result;
4409}
4410
Tim Petersced69f82003-09-16 20:30:58 +00004411static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412int tailmatch(PyUnicodeObject *self,
4413 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004414 Py_ssize_t start,
4415 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 int direction)
4417{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 if (substring->length == 0)
4419 return 1;
4420
Thomas Wouters477c8d52006-05-27 19:21:47 +00004421 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
4423 end -= substring->length;
4424 if (end < start)
4425 return 0;
4426
4427 if (direction > 0) {
4428 if (Py_UNICODE_MATCH(self, end, substring))
4429 return 1;
4430 } else {
4431 if (Py_UNICODE_MATCH(self, start, substring))
4432 return 1;
4433 }
4434
4435 return 0;
4436}
4437
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t start,
4441 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 int direction)
4443{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004445
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 str = PyUnicode_FromObject(str);
4447 if (str == NULL)
4448 return -1;
4449 substr = PyUnicode_FromObject(substr);
4450 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004451 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 return -1;
4453 }
Tim Petersced69f82003-09-16 20:30:58 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 result = tailmatch((PyUnicodeObject *)str,
4456 (PyUnicodeObject *)substr,
4457 start, end, direction);
4458 Py_DECREF(str);
4459 Py_DECREF(substr);
4460 return result;
4461}
4462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463/* Apply fixfct filter to the Unicode object self and return a
4464 reference to the modified object */
4465
Tim Petersced69f82003-09-16 20:30:58 +00004466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *fixup(PyUnicodeObject *self,
4468 int (*fixfct)(PyUnicodeObject *s))
4469{
4470
4471 PyUnicodeObject *u;
4472
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004473 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 if (u == NULL)
4475 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004476
4477 Py_UNICODE_COPY(u->str, self->str, self->length);
4478
Tim Peters7a29bd52001-09-12 03:03:31 +00004479 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 /* fixfct should return TRUE if it modified the buffer. If
4481 FALSE, return a reference to the original buffer instead
4482 (to save space, not time) */
4483 Py_INCREF(self);
4484 Py_DECREF(u);
4485 return (PyObject*) self;
4486 }
4487 return (PyObject*) u;
4488}
4489
Tim Petersced69f82003-09-16 20:30:58 +00004490static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491int fixupper(PyUnicodeObject *self)
4492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004493 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 Py_UNICODE *s = self->str;
4495 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 while (len-- > 0) {
4498 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 ch = Py_UNICODE_TOUPPER(*s);
4501 if (ch != *s) {
4502 status = 1;
4503 *s = ch;
4504 }
4505 s++;
4506 }
4507
4508 return status;
4509}
4510
Tim Petersced69f82003-09-16 20:30:58 +00004511static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512int fixlower(PyUnicodeObject *self)
4513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 Py_UNICODE *s = self->str;
4516 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004517
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 while (len-- > 0) {
4519 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004520
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 ch = Py_UNICODE_TOLOWER(*s);
4522 if (ch != *s) {
4523 status = 1;
4524 *s = ch;
4525 }
4526 s++;
4527 }
4528
4529 return status;
4530}
4531
Tim Petersced69f82003-09-16 20:30:58 +00004532static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533int fixswapcase(PyUnicodeObject *self)
4534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 Py_UNICODE *s = self->str;
4537 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 while (len-- > 0) {
4540 if (Py_UNICODE_ISUPPER(*s)) {
4541 *s = Py_UNICODE_TOLOWER(*s);
4542 status = 1;
4543 } else if (Py_UNICODE_ISLOWER(*s)) {
4544 *s = Py_UNICODE_TOUPPER(*s);
4545 status = 1;
4546 }
4547 s++;
4548 }
4549
4550 return status;
4551}
4552
Tim Petersced69f82003-09-16 20:30:58 +00004553static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554int fixcapitalize(PyUnicodeObject *self)
4555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004557 Py_UNICODE *s = self->str;
4558 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004559
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004560 if (len == 0)
4561 return 0;
4562 if (Py_UNICODE_ISLOWER(*s)) {
4563 *s = Py_UNICODE_TOUPPER(*s);
4564 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004566 s++;
4567 while (--len > 0) {
4568 if (Py_UNICODE_ISUPPER(*s)) {
4569 *s = Py_UNICODE_TOLOWER(*s);
4570 status = 1;
4571 }
4572 s++;
4573 }
4574 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575}
4576
4577static
4578int fixtitle(PyUnicodeObject *self)
4579{
4580 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4581 register Py_UNICODE *e;
4582 int previous_is_cased;
4583
4584 /* Shortcut for single character strings */
4585 if (PyUnicode_GET_SIZE(self) == 1) {
4586 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4587 if (*p != ch) {
4588 *p = ch;
4589 return 1;
4590 }
4591 else
4592 return 0;
4593 }
Tim Petersced69f82003-09-16 20:30:58 +00004594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 e = p + PyUnicode_GET_SIZE(self);
4596 previous_is_cased = 0;
4597 for (; p < e; p++) {
4598 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004599
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 if (previous_is_cased)
4601 *p = Py_UNICODE_TOLOWER(ch);
4602 else
4603 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004604
4605 if (Py_UNICODE_ISLOWER(ch) ||
4606 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 Py_UNICODE_ISTITLE(ch))
4608 previous_is_cased = 1;
4609 else
4610 previous_is_cased = 0;
4611 }
4612 return 1;
4613}
4614
Tim Peters8ce9f162004-08-27 01:49:32 +00004615PyObject *
4616PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Tim Peters8ce9f162004-08-27 01:49:32 +00004618 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004619 const Py_UNICODE blank = ' ';
4620 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004621 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004622 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004623 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4624 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4626 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004627 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004628 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004629 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 fseq = PySequence_Fast(seq, "");
4632 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004633 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004634 }
4635
Tim Peters91879ab2004-08-27 22:35:44 +00004636 /* Grrrr. A codec may be invoked to convert str objects to
4637 * Unicode, and so it's possible to call back into Python code
4638 * during PyUnicode_FromObject(), and so it's possible for a sick
4639 * codec to change the size of fseq (if seq is a list). Therefore
4640 * we have to keep refetching the size -- can't assume seqlen
4641 * is invariant.
4642 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004643 seqlen = PySequence_Fast_GET_SIZE(fseq);
4644 /* If empty sequence, return u"". */
4645 if (seqlen == 0) {
4646 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4647 goto Done;
4648 }
4649 /* If singleton sequence with an exact Unicode, return that. */
4650 if (seqlen == 1) {
4651 item = PySequence_Fast_GET_ITEM(fseq, 0);
4652 if (PyUnicode_CheckExact(item)) {
4653 Py_INCREF(item);
4654 res = (PyUnicodeObject *)item;
4655 goto Done;
4656 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004657 }
4658
Tim Peters05eba1f2004-08-27 21:32:02 +00004659 /* At least two items to join, or one that isn't exact Unicode. */
4660 if (seqlen > 1) {
4661 /* Set up sep and seplen -- they're needed. */
4662 if (separator == NULL) {
4663 sep = &blank;
4664 seplen = 1;
4665 }
4666 else {
4667 internal_separator = PyUnicode_FromObject(separator);
4668 if (internal_separator == NULL)
4669 goto onError;
4670 sep = PyUnicode_AS_UNICODE(internal_separator);
4671 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004672 /* In case PyUnicode_FromObject() mutated seq. */
4673 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 }
4675 }
4676
4677 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004678 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004680 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 res_p = PyUnicode_AS_UNICODE(res);
4682 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004683
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004685 Py_ssize_t itemlen;
4686 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004687
4688 item = PySequence_Fast_GET_ITEM(fseq, i);
4689 /* Convert item to Unicode. */
4690 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4691 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004692 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 " %.80s found",
4694 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004695 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004696 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004697 item = PyUnicode_FromObject(item);
4698 if (item == NULL)
4699 goto onError;
4700 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004701
Tim Peters91879ab2004-08-27 22:35:44 +00004702 /* In case PyUnicode_FromObject() mutated seq. */
4703 seqlen = PySequence_Fast_GET_SIZE(fseq);
4704
Tim Peters8ce9f162004-08-27 01:49:32 +00004705 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004707 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004708 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004709 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004710 if (i < seqlen - 1) {
4711 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004712 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004713 goto Overflow;
4714 }
4715 if (new_res_used > res_alloc) {
4716 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004717 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004718 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004719 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004720 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004721 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004722 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004723 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004725 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004726 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004728
4729 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004730 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004731 res_p += itemlen;
4732 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004733 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004734 res_p += seplen;
4735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004737 res_used = new_res_used;
4738 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004739
Tim Peters05eba1f2004-08-27 21:32:02 +00004740 /* Shrink res to match the used area; this probably can't fail,
4741 * but it's cheap to check.
4742 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004743 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004744 goto onError;
4745
4746 Done:
4747 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004748 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 return (PyObject *)res;
4750
Tim Peters8ce9f162004-08-27 01:49:32 +00004751 Overflow:
4752 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004753 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004754 Py_DECREF(item);
4755 /* fall through */
4756
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004759 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004760 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 return NULL;
4762}
4763
Tim Petersced69f82003-09-16 20:30:58 +00004764static
4765PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t left,
4767 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 Py_UNICODE fill)
4769{
4770 PyUnicodeObject *u;
4771
4772 if (left < 0)
4773 left = 0;
4774 if (right < 0)
4775 right = 0;
4776
Tim Peters7a29bd52001-09-12 03:03:31 +00004777 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 Py_INCREF(self);
4779 return self;
4780 }
4781
4782 u = _PyUnicode_New(left + self->length + right);
4783 if (u) {
4784 if (left)
4785 Py_UNICODE_FILL(u->str, fill, left);
4786 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4787 if (right)
4788 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4789 }
4790
4791 return u;
4792}
4793
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when either the slice cannot be created or the
   append fails.  The body is wrapped in do { } while (0) so that
   `SPLIT_APPEND(...);` is a single statement and stays correct even under
   an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4804
4805static
4806PyObject *split_whitespace(PyUnicodeObject *self,
4807 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004810 register Py_ssize_t i;
4811 register Py_ssize_t j;
4812 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 PyObject *str;
4814
4815 for (i = j = 0; i < len; ) {
4816 /* find a token */
4817 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4818 i++;
4819 j = i;
4820 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4821 i++;
4822 if (j < i) {
4823 if (maxcount-- <= 0)
4824 break;
4825 SPLIT_APPEND(self->str, j, i);
4826 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4827 i++;
4828 j = i;
4829 }
4830 }
4831 if (j < len) {
4832 SPLIT_APPEND(self->str, j, len);
4833 }
4834 return list;
4835
4836 onError:
4837 Py_DECREF(list);
4838 return NULL;
4839}
4840
4841PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004842 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 register Py_ssize_t i;
4845 register Py_ssize_t j;
4846 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 PyObject *list;
4848 PyObject *str;
4849 Py_UNICODE *data;
4850
4851 string = PyUnicode_FromObject(string);
4852 if (string == NULL)
4853 return NULL;
4854 data = PyUnicode_AS_UNICODE(string);
4855 len = PyUnicode_GET_SIZE(string);
4856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 list = PyList_New(0);
4858 if (!list)
4859 goto onError;
4860
4861 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004865 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
4868 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004869 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 if (i < len) {
4871 if (data[i] == '\r' && i + 1 < len &&
4872 data[i+1] == '\n')
4873 i += 2;
4874 else
4875 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004876 if (keepends)
4877 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 }
Guido van Rossum86662912000-04-11 15:38:46 +00004879 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 j = i;
4881 }
4882 if (j < len) {
4883 SPLIT_APPEND(data, j, len);
4884 }
4885
4886 Py_DECREF(string);
4887 return list;
4888
4889 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004890 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 Py_DECREF(string);
4892 return NULL;
4893}
4894
Tim Petersced69f82003-09-16 20:30:58 +00004895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896PyObject *split_char(PyUnicodeObject *self,
4897 PyObject *list,
4898 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 register Py_ssize_t i;
4902 register Py_ssize_t j;
4903 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 PyObject *str;
4905
4906 for (i = j = 0; i < len; ) {
4907 if (self->str[i] == ch) {
4908 if (maxcount-- <= 0)
4909 break;
4910 SPLIT_APPEND(self->str, j, i);
4911 i = j = i + 1;
4912 } else
4913 i++;
4914 }
4915 if (j <= len) {
4916 SPLIT_APPEND(self->str, j, len);
4917 }
4918 return list;
4919
4920 onError:
4921 Py_DECREF(list);
4922 return NULL;
4923}
4924
Tim Petersced69f82003-09-16 20:30:58 +00004925static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926PyObject *split_substring(PyUnicodeObject *self,
4927 PyObject *list,
4928 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004929 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004931 register Py_ssize_t i;
4932 register Py_ssize_t j;
4933 Py_ssize_t len = self->length;
4934 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 PyObject *str;
4936
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004937 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 if (Py_UNICODE_MATCH(self, i, substring)) {
4939 if (maxcount-- <= 0)
4940 break;
4941 SPLIT_APPEND(self->str, j, i);
4942 i = j = i + sublen;
4943 } else
4944 i++;
4945 }
4946 if (j <= len) {
4947 SPLIT_APPEND(self->str, j, len);
4948 }
4949 return list;
4950
4951 onError:
4952 Py_DECREF(list);
4953 return NULL;
4954}
4955
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956static
4957PyObject *rsplit_whitespace(PyUnicodeObject *self,
4958 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004961 register Py_ssize_t i;
4962 register Py_ssize_t j;
4963 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004964 PyObject *str;
4965
4966 for (i = j = len - 1; i >= 0; ) {
4967 /* find a token */
4968 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4969 i--;
4970 j = i;
4971 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4972 i--;
4973 if (j > i) {
4974 if (maxcount-- <= 0)
4975 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004976 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004977 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4978 i--;
4979 j = i;
4980 }
4981 }
4982 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004983 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004985 if (PyList_Reverse(list) < 0)
4986 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004987 return list;
4988
4989 onError:
4990 Py_DECREF(list);
4991 return NULL;
4992}
4993
4994static
4995PyObject *rsplit_char(PyUnicodeObject *self,
4996 PyObject *list,
4997 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004999{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 register Py_ssize_t i;
5001 register Py_ssize_t j;
5002 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003 PyObject *str;
5004
5005 for (i = j = len - 1; i >= 0; ) {
5006 if (self->str[i] == ch) {
5007 if (maxcount-- <= 0)
5008 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005009 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005010 j = i = i - 1;
5011 } else
5012 i--;
5013 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005014 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005016 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005017 if (PyList_Reverse(list) < 0)
5018 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005019 return list;
5020
5021 onError:
5022 Py_DECREF(list);
5023 return NULL;
5024}
5025
5026static
5027PyObject *rsplit_substring(PyUnicodeObject *self,
5028 PyObject *list,
5029 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005031{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005032 register Py_ssize_t i;
5033 register Py_ssize_t j;
5034 Py_ssize_t len = self->length;
5035 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036 PyObject *str;
5037
5038 for (i = len - sublen, j = len; i >= 0; ) {
5039 if (Py_UNICODE_MATCH(self, i, substring)) {
5040 if (maxcount-- <= 0)
5041 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005042 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005043 j = i;
5044 i -= sublen;
5045 } else
5046 i--;
5047 }
5048 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005049 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005050 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005051 if (PyList_Reverse(list) < 0)
5052 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053 return list;
5054
5055 onError:
5056 Py_DECREF(list);
5057 return NULL;
5058}
5059
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060#undef SPLIT_APPEND
5061
5062static
5063PyObject *split(PyUnicodeObject *self,
5064 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
5067 PyObject *list;
5068
5069 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005070 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
5072 list = PyList_New(0);
5073 if (!list)
5074 return NULL;
5075
5076 if (substring == NULL)
5077 return split_whitespace(self,list,maxcount);
5078
5079 else if (substring->length == 1)
5080 return split_char(self,list,substring->str[0],maxcount);
5081
5082 else if (substring->length == 0) {
5083 Py_DECREF(list);
5084 PyErr_SetString(PyExc_ValueError, "empty separator");
5085 return NULL;
5086 }
5087 else
5088 return split_substring(self,list,substring,maxcount);
5089}
5090
Tim Petersced69f82003-09-16 20:30:58 +00005091static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005092PyObject *rsplit(PyUnicodeObject *self,
5093 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005095{
5096 PyObject *list;
5097
5098 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005099 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005100
5101 list = PyList_New(0);
5102 if (!list)
5103 return NULL;
5104
5105 if (substring == NULL)
5106 return rsplit_whitespace(self,list,maxcount);
5107
5108 else if (substring->length == 1)
5109 return rsplit_char(self,list,substring->str[0],maxcount);
5110
5111 else if (substring->length == 0) {
5112 Py_DECREF(list);
5113 PyErr_SetString(PyExc_ValueError, "empty separator");
5114 return NULL;
5115 }
5116 else
5117 return rsplit_substring(self,list,substring,maxcount);
5118}
5119
5120static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121PyObject *replace(PyUnicodeObject *self,
5122 PyUnicodeObject *str1,
5123 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125{
5126 PyUnicodeObject *u;
5127
5128 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005129 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 if (str1->length == str2->length) {
5132 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005133 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005134 if (str1->length == 1) {
5135 /* replace characters */
5136 Py_UNICODE u1, u2;
5137 if (!findchar(self->str, self->length, str1->str[0]))
5138 goto nothing;
5139 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5140 if (!u)
5141 return NULL;
5142 Py_UNICODE_COPY(u->str, self->str, self->length);
5143 u1 = str1->str[0];
5144 u2 = str2->str[0];
5145 for (i = 0; i < u->length; i++)
5146 if (u->str[i] == u1) {
5147 if (--maxcount < 0)
5148 break;
5149 u->str[i] = u2;
5150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005152 i = fastsearch(
5153 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005155 if (i < 0)
5156 goto nothing;
5157 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5158 if (!u)
5159 return NULL;
5160 Py_UNICODE_COPY(u->str, self->str, self->length);
5161 while (i <= self->length - str1->length)
5162 if (Py_UNICODE_MATCH(self, i, str1)) {
5163 if (--maxcount < 0)
5164 break;
5165 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5166 i += str1->length;
5167 } else
5168 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005171
5172 Py_ssize_t n, i, j, e;
5173 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_UNICODE *p;
5175
5176 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005177 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 if (n > maxcount)
5179 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180 if (n == 0)
5181 goto nothing;
5182 /* new_size = self->length + n * (str2->length - str1->length)); */
5183 delta = (str2->length - str1->length);
5184 if (delta == 0) {
5185 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005187 product = n * (str2->length - str1->length);
5188 if ((product / (str2->length - str1->length)) != n) {
5189 PyErr_SetString(PyExc_OverflowError,
5190 "replace string is too long");
5191 return NULL;
5192 }
5193 new_size = self->length + product;
5194 if (new_size < 0) {
5195 PyErr_SetString(PyExc_OverflowError,
5196 "replace string is too long");
5197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 }
5199 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005200 u = _PyUnicode_New(new_size);
5201 if (!u)
5202 return NULL;
5203 i = 0;
5204 p = u->str;
5205 e = self->length - str1->length;
5206 if (str1->length > 0) {
5207 while (n-- > 0) {
5208 /* look for next match */
5209 j = i;
5210 while (j <= e) {
5211 if (Py_UNICODE_MATCH(self, j, str1))
5212 break;
5213 j++;
5214 }
5215 if (j > i) {
5216 if (j > e)
5217 break;
5218 /* copy unchanged part [i:j] */
5219 Py_UNICODE_COPY(p, self->str+i, j-i);
5220 p += j - i;
5221 }
5222 /* copy substitution string */
5223 if (str2->length > 0) {
5224 Py_UNICODE_COPY(p, str2->str, str2->length);
5225 p += str2->length;
5226 }
5227 i = j + str1->length;
5228 }
5229 if (i < self->length)
5230 /* copy tail [i:] */
5231 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5232 } else {
5233 /* interleave */
5234 while (n > 0) {
5235 Py_UNICODE_COPY(p, str2->str, str2->length);
5236 p += str2->length;
5237 if (--n <= 0)
5238 break;
5239 *p++ = self->str[i++];
5240 }
5241 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005245
5246nothing:
5247 /* nothing to replace; return original string (when possible) */
5248 if (PyUnicode_CheckExact(self)) {
5249 Py_INCREF(self);
5250 return (PyObject *) self;
5251 }
5252 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253}
5254
5255/* --- Unicode Object Methods --------------------------------------------- */
5256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005257PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258"S.title() -> unicode\n\
5259\n\
5260Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005264unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 return fixup(self, fixtitle);
5267}
5268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005269PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270"S.capitalize() -> unicode\n\
5271\n\
5272Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
5275static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005276unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 return fixup(self, fixcapitalize);
5279}
5280
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split S on whitespace, capitalize every word
   in place in the temporary list, then join the words again with the
   default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5318
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005319/* Argument converter. Coerces to a single unicode character */
5320
5321static int
5322convert_uc(PyObject *obj, void *addr)
5323{
5324 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5325 PyObject *uniobj;
5326 Py_UNICODE *unistr;
5327
5328 uniobj = PyUnicode_FromObject(obj);
5329 if (uniobj == NULL) {
5330 PyErr_SetString(PyExc_TypeError,
5331 "The fill character cannot be converted to Unicode");
5332 return 0;
5333 }
5334 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5335 PyErr_SetString(PyExc_TypeError,
5336 "The fill character must be exactly one character long");
5337 Py_DECREF(uniobj);
5338 return 0;
5339 }
5340 unistr = PyUnicode_AS_UNICODE(uniobj);
5341 *fillcharloc = unistr[0];
5342 Py_DECREF(uniobj);
5343 return 1;
5344}
5345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005346PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005347"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005349Return S centered in a Unicode string of length width. Padding is\n\
5350done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352static PyObject *
5353unicode_center(PyUnicodeObject *self, PyObject *args)
5354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005355 Py_ssize_t marg, left;
5356 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005357 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Thomas Woutersde017742006-02-16 19:34:37 +00005359 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return NULL;
5361
Tim Peters7a29bd52001-09-12 03:03:31 +00005362 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 Py_INCREF(self);
5364 return (PyObject*) self;
5365 }
5366
5367 marg = width - self->length;
5368 left = marg / 2 + (marg & width & 1);
5369
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005370 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371}
5372
Marc-André Lemburge5034372000-08-08 08:04:29 +00005373#if 0
5374
5375/* This code should go into some future Unicode collation support
5376 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005377 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005378
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005379/* speedy UTF-16 code point order comparison */
5380/* gleaned from: */
5381/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5382
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005383static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005384{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005385 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005386 0, 0, 0, 0, 0, 0, 0, 0,
5387 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005388 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005389};
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391static int
5392unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 Py_UNICODE *s1 = str1->str;
5397 Py_UNICODE *s2 = str2->str;
5398
5399 len1 = str1->length;
5400 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005403 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005404
5405 c1 = *s1++;
5406 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005407
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005408 if (c1 > (1<<11) * 26)
5409 c1 += utf16Fixup[c1>>11];
5410 if (c2 > (1<<11) * 26)
5411 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005412 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005413
5414 if (c1 != c2)
5415 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005416
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005417 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
5419
5420 return (len1 < len2) ? -1 : (len1 != len2);
5421}
5422
Marc-André Lemburge5034372000-08-08 08:04:29 +00005423#else
5424
5425static int
5426unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005429
5430 Py_UNICODE *s1 = str1->str;
5431 Py_UNICODE *s2 = str2->str;
5432
5433 len1 = str1->length;
5434 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005435
Marc-André Lemburge5034372000-08-08 08:04:29 +00005436 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005437 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005438
Fredrik Lundh45714e92001-06-26 16:39:36 +00005439 c1 = *s1++;
5440 c2 = *s2++;
5441
5442 if (c1 != c2)
5443 return (c1 < c2) ? -1 : 1;
5444
Marc-André Lemburge5034372000-08-08 08:04:29 +00005445 len1--; len2--;
5446 }
5447
5448 return (len1 < len2) ? -1 : (len1 != len2);
5449}
5450
5451#endif
5452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453int PyUnicode_Compare(PyObject *left,
5454 PyObject *right)
5455{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005456 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5457 return unicode_compare((PyUnicodeObject *)left,
5458 (PyUnicodeObject *)right);
5459 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5460 (PyUnicode_Check(left) && PyString_Check(right))) {
5461 if (PyUnicode_Check(left))
5462 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5463 if (PyUnicode_Check(right))
5464 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5465 assert(PyString_Check(left));
5466 assert(PyString_Check(right));
5467 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005469 PyErr_Format(PyExc_TypeError,
5470 "Can't compare %.100s and %.100s",
5471 left->ob_type->tp_name,
5472 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return -1;
5474}
5475
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005476PyObject *PyUnicode_RichCompare(PyObject *left,
5477 PyObject *right,
5478 int op)
5479{
5480 int result;
5481
5482 result = PyUnicode_Compare(left, right);
5483 if (result == -1 && PyErr_Occurred())
5484 goto onError;
5485
5486 /* Convert the return value to a Boolean */
5487 switch (op) {
5488 case Py_EQ:
5489 result = (result == 0);
5490 break;
5491 case Py_NE:
5492 result = (result != 0);
5493 break;
5494 case Py_LE:
5495 result = (result <= 0);
5496 break;
5497 case Py_GE:
5498 result = (result >= 0);
5499 break;
5500 case Py_LT:
5501 result = (result == -1);
5502 break;
5503 case Py_GT:
5504 result = (result == 1);
5505 break;
5506 }
5507 return PyBool_FromLong(result);
5508
5509 onError:
5510
5511 /* Standard case
5512
5513 Type errors mean that PyUnicode_FromObject() could not convert
5514 one of the arguments (usually the right hand side) to Unicode,
5515 ie. we can't handle the comparison request. However, it is
5516 possible that the other object knows a comparison method, which
5517 is why we return Py_NotImplemented to give the other object a
5518 chance.
5519
5520 */
5521 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5522 PyErr_Clear();
5523 Py_INCREF(Py_NotImplemented);
5524 return Py_NotImplemented;
5525 }
5526 if (op != Py_EQ && op != Py_NE)
5527 return NULL;
5528
5529 /* Equality comparison.
5530
5531 This is a special case: we silence any PyExc_UnicodeDecodeError
5532 and instead turn it into a PyErr_UnicodeWarning.
5533
5534 */
5535 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5536 return NULL;
5537 PyErr_Clear();
5538 if (PyErr_Warn(PyExc_UnicodeWarning,
5539 (op == Py_EQ) ?
5540 "Unicode equal comparison "
5541 "failed to convert both arguments to Unicode - "
5542 "interpreting them as being unequal" :
5543 "Unicode unequal comparison "
5544 "failed to convert both arguments to Unicode - "
5545 "interpreting them as being unequal"
5546 ) < 0)
5547 return NULL;
5548 result = (op == Py_NE);
5549 return PyBool_FromLong(result);
5550}
5551
Guido van Rossum403d68b2000-03-13 15:55:09 +00005552int PyUnicode_Contains(PyObject *container,
5553 PyObject *element)
5554{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005557
5558 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559 sub = PyUnicode_FromObject(element);
5560 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005561 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005562 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005563 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005564 }
5565
Thomas Wouters477c8d52006-05-27 19:21:47 +00005566 str = PyUnicode_FromObject(container);
5567 if (!str) {
5568 Py_DECREF(sub);
5569 return -1;
5570 }
5571
5572 result = stringlib_contains_obj(str, sub);
5573
5574 Py_DECREF(str);
5575 Py_DECREF(sub);
5576
Guido van Rossum403d68b2000-03-13 15:55:09 +00005577 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005578}
5579
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580/* Concat to string or Unicode object giving a new Unicode object. */
5581
5582PyObject *PyUnicode_Concat(PyObject *left,
5583 PyObject *right)
5584{
5585 PyUnicodeObject *u = NULL, *v = NULL, *w;
5586
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005587 if (PyBytes_Check(left) || PyBytes_Check(right))
5588 return PyBytes_Concat(left, right);
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 /* Coerce the two arguments */
5591 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5592 if (u == NULL)
5593 goto onError;
5594 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5595 if (v == NULL)
5596 goto onError;
5597
5598 /* Shortcuts */
5599 if (v == unicode_empty) {
5600 Py_DECREF(v);
5601 return (PyObject *)u;
5602 }
5603 if (u == unicode_empty) {
5604 Py_DECREF(u);
5605 return (PyObject *)v;
5606 }
5607
5608 /* Concat the two Unicode strings */
5609 w = _PyUnicode_New(u->length + v->length);
5610 if (w == NULL)
5611 goto onError;
5612 Py_UNICODE_COPY(w->str, u->str, u->length);
5613 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5614
5615 Py_DECREF(u);
5616 Py_DECREF(v);
5617 return (PyObject *)w;
5618
5619onError:
5620 Py_XDECREF(u);
5621 Py_XDECREF(v);
5622 return NULL;
5623}
5624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626"S.count(sub[, start[, end]]) -> int\n\
5627\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005628Return the number of non-overlapping occurrences of substring sub in\n\
5629Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005630interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
5632static PyObject *
5633unicode_count(PyUnicodeObject *self, PyObject *args)
5634{
5635 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005637 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 PyObject *result;
5639
Guido van Rossumb8872e62000-05-09 14:14:27 +00005640 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5641 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 return NULL;
5643
5644 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005645 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (substring == NULL)
5647 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005648
Thomas Wouters477c8d52006-05-27 19:21:47 +00005649 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Thomas Wouters477c8d52006-05-27 19:21:47 +00005651 result = PyInt_FromSsize_t(
5652 stringlib_count(self->str + start, end - start,
5653 substring->str, substring->length)
5654 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
5656 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005657
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 return result;
5659}
5660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005662"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005664Encodes S using the codec registered for encoding. encoding defaults\n\
5665to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005666handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5668'xmlcharrefreplace' as well as any other name registered with\n\
5669codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
5671static PyObject *
5672unicode_encode(PyUnicodeObject *self, PyObject *args)
5673{
5674 char *encoding = NULL;
5675 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005676 PyObject *v;
5677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5679 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005680 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005681 if (v == NULL)
5682 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005683 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005684 if (PyString_Check(v)) {
5685 /* Old codec, turn it into bytes */
5686 PyObject *b = PyBytes_FromObject(v);
5687 Py_DECREF(v);
5688 return b;
5689 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005690 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005691 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005692 "(type=%.400s)",
5693 v->ob_type->tp_name);
5694 Py_DECREF(v);
5695 return NULL;
5696 }
5697 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005698
5699 onError:
5700 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005701}
5702
5703PyDoc_STRVAR(decode__doc__,
5704"S.decode([encoding[,errors]]) -> string or unicode\n\
5705\n\
5706Decodes S using the codec registered for encoding. encoding defaults\n\
5707to the default encoding. errors may be given to set a different error\n\
5708handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5709a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5710as well as any other name registerd with codecs.register_error that is\n\
5711able to handle UnicodeDecodeErrors.");
5712
5713static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005714unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005715{
5716 char *encoding = NULL;
5717 char *errors = NULL;
5718 PyObject *v;
5719
5720 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5721 return NULL;
5722 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005723 if (v == NULL)
5724 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005725 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5726 PyErr_Format(PyExc_TypeError,
5727 "decoder did not return a string/unicode object "
5728 "(type=%.400s)",
5729 v->ob_type->tp_name);
5730 Py_DECREF(v);
5731 return NULL;
5732 }
5733 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005734
5735 onError:
5736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005739PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740"S.expandtabs([tabsize]) -> unicode\n\
5741\n\
5742Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
5745static PyObject*
5746unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5747{
5748 Py_UNICODE *e;
5749 Py_UNICODE *p;
5750 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 PyUnicodeObject *u;
5753 int tabsize = 8;
5754
5755 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5756 return NULL;
5757
Thomas Wouters7e474022000-07-16 12:04:32 +00005758 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 i = j = 0;
5760 e = self->str + self->length;
5761 for (p = self->str; p < e; p++)
5762 if (*p == '\t') {
5763 if (tabsize > 0)
5764 j += tabsize - (j % tabsize);
5765 }
5766 else {
5767 j++;
5768 if (*p == '\n' || *p == '\r') {
5769 i += j;
5770 j = 0;
5771 }
5772 }
5773
5774 /* Second pass: create output string and fill it */
5775 u = _PyUnicode_New(i + j);
5776 if (!u)
5777 return NULL;
5778
5779 j = 0;
5780 q = u->str;
5781
5782 for (p = self->str; p < e; p++)
5783 if (*p == '\t') {
5784 if (tabsize > 0) {
5785 i = tabsize - (j % tabsize);
5786 j += i;
5787 while (i--)
5788 *q++ = ' ';
5789 }
5790 }
5791 else {
5792 j++;
5793 *q++ = *p;
5794 if (*p == '\n' || *p == '\r')
5795 j = 0;
5796 }
5797
5798 return (PyObject*) u;
5799}
5800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005801PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802"S.find(sub [,start [,end]]) -> int\n\
5803\n\
5804Return the lowest index in S where substring sub is found,\n\
5805such that sub is contained within s[start,end]. Optional\n\
5806arguments start and end are interpreted as in slice notation.\n\
5807\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005808Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
5810static PyObject *
5811unicode_find(PyUnicodeObject *self, PyObject *args)
5812{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005815 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005816 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Guido van Rossumb8872e62000-05-09 14:14:27 +00005818 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5819 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005821 substring = PyUnicode_FromObject(substring);
5822 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 return NULL;
5824
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 result = stringlib_find_slice(
5826 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5827 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5828 start, end
5829 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
5831 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005832
5833 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834}
5835
5836static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838{
5839 if (index < 0 || index >= self->length) {
5840 PyErr_SetString(PyExc_IndexError, "string index out of range");
5841 return NULL;
5842 }
5843
5844 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5845}
5846
5847static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005848unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005850 /* Since Unicode objects compare equal to their UTF-8 string
5851 counterparts, we hash the UTF-8 string. */
5852 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5853 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854}
5855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005856PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857"S.index(sub [,start [,end]]) -> int\n\
5858\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005859Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
5861static PyObject *
5862unicode_index(PyUnicodeObject *self, PyObject *args)
5863{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005865 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005867 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Guido van Rossumb8872e62000-05-09 14:14:27 +00005869 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5870 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005872 substring = PyUnicode_FromObject(substring);
5873 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 return NULL;
5875
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876 result = stringlib_find_slice(
5877 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5878 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5879 start, end
5880 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
5882 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005883
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 if (result < 0) {
5885 PyErr_SetString(PyExc_ValueError, "substring not found");
5886 return NULL;
5887 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888
Martin v. Löwis18e16552006-02-15 17:27:45 +00005889 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890}
5891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005892PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005893"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005895Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005896at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
5898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005899unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
5901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5902 register const Py_UNICODE *e;
5903 int cased;
5904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 /* Shortcut for single character strings */
5906 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005907 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005909 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005910 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005912
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 e = p + PyUnicode_GET_SIZE(self);
5914 cased = 0;
5915 for (; p < e; p++) {
5916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 else if (!cased && Py_UNICODE_ISLOWER(ch))
5921 cased = 1;
5922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005923 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924}
5925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005926PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005927"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005929Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
5932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005933unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
5935 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5936 register const Py_UNICODE *e;
5937 int cased;
5938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 /* Shortcut for single character strings */
5940 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005941 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005943 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005944 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 e = p + PyUnicode_GET_SIZE(self);
5948 cased = 0;
5949 for (; p < e; p++) {
5950 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005951
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005953 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 else if (!cased && Py_UNICODE_ISUPPER(ch))
5955 cased = 1;
5956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005957 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958}
5959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005960PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005963Return True if S is a titlecased string and there is at least one\n\
5964character in S, i.e. upper- and titlecase characters may only\n\
5965follow uncased characters and lowercase characters only cased ones.\n\
5966Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005969unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970{
5971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5972 register const Py_UNICODE *e;
5973 int cased, previous_is_cased;
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 /* Shortcut for single character strings */
5976 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005977 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5978 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005980 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005981 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 e = p + PyUnicode_GET_SIZE(self);
5985 cased = 0;
5986 previous_is_cased = 0;
5987 for (; p < e; p++) {
5988 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005989
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5991 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 previous_is_cased = 1;
5994 cased = 1;
5995 }
5996 else if (Py_UNICODE_ISLOWER(ch)) {
5997 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005998 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 previous_is_cased = 1;
6000 cased = 1;
6001 }
6002 else
6003 previous_is_cased = 0;
6004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006005 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006}
6007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006008PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006011Return True if all characters in S are whitespace\n\
6012and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
6014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006015unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016{
6017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6018 register const Py_UNICODE *e;
6019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 /* Shortcut for single character strings */
6021 if (PyUnicode_GET_SIZE(self) == 1 &&
6022 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006025 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006026 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 e = p + PyUnicode_GET_SIZE(self);
6030 for (; p < e; p++) {
6031 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006040Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006042
6043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006044unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006045{
6046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6047 register const Py_UNICODE *e;
6048
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049 /* Shortcut for single character strings */
6050 if (PyUnicode_GET_SIZE(self) == 1 &&
6051 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006053
6054 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006055 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006057
6058 e = p + PyUnicode_GET_SIZE(self);
6059 for (; p < e; p++) {
6060 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006063 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006064}
6065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006066PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006067"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006069Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006071
6072static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006073unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006074{
6075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6076 register const Py_UNICODE *e;
6077
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006078 /* Shortcut for single character strings */
6079 if (PyUnicode_GET_SIZE(self) == 1 &&
6080 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006081 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006082
6083 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006084 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006085 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006086
6087 e = p + PyUnicode_GET_SIZE(self);
6088 for (; p < e; p++) {
6089 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006092 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006093}
6094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006098Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
6101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006102unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6105 register const Py_UNICODE *e;
6106
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 /* Shortcut for single character strings */
6108 if (PyUnicode_GET_SIZE(self) == 1 &&
6109 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006112 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006113 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 e = p + PyUnicode_GET_SIZE(self);
6117 for (; p < e; p++) {
6118 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122}
6123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006127Return True if all characters in S are digits\n\
6128and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
6130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006131unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
6133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6134 register const Py_UNICODE *e;
6135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 /* Shortcut for single character strings */
6137 if (PyUnicode_GET_SIZE(self) == 1 &&
6138 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006141 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006142 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 e = p + PyUnicode_GET_SIZE(self);
6146 for (; p < e; p++) {
6147 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151}
6152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006153PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006156Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006157False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
6159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006160unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
6162 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6163 register const Py_UNICODE *e;
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 /* Shortcut for single character strings */
6166 if (PyUnicode_GET_SIZE(self) == 1 &&
6167 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006168 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006170 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006171 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 e = p + PyUnicode_GET_SIZE(self);
6175 for (; p < e; p++) {
6176 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183"S.join(sequence) -> unicode\n\
6184\n\
6185Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006189unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006191 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192}
6193
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195unicode_length(PyUnicodeObject *self)
6196{
6197 return self->length;
6198}
6199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006200PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006201"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202\n\
6203Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006204done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205
6206static PyObject *
6207unicode_ljust(PyUnicodeObject *self, PyObject *args)
6208{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006209 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006210 Py_UNICODE fillchar = ' ';
6211
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006212 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return NULL;
6214
Tim Peters7a29bd52001-09-12 03:03:31 +00006215 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 Py_INCREF(self);
6217 return (PyObject*) self;
6218 }
6219
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006220 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224"S.lower() -> unicode\n\
6225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006226Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
6228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006229unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 return fixup(self, fixlower);
6232}
6233
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6242
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243/* externally visible for str.strip(unicode) */
6244PyObject *
6245_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6246{
6247 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006249 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006250 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6251 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006252
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6254
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006255 i = 0;
6256 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006257 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6258 i++;
6259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006260 }
6261
6262 j = len;
6263 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 do {
6265 j--;
6266 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6267 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006268 }
6269
6270 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006271 Py_INCREF(self);
6272 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006273 }
6274 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006275 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006276}
6277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278
6279static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006282 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006283 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006284
6285 i = 0;
6286 if (striptype != RIGHTSTRIP) {
6287 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6288 i++;
6289 }
6290 }
6291
6292 j = len;
6293 if (striptype != LEFTSTRIP) {
6294 do {
6295 j--;
6296 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6297 j++;
6298 }
6299
6300 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6301 Py_INCREF(self);
6302 return (PyObject*)self;
6303 }
6304 else
6305 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308
6309static PyObject *
6310do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6311{
6312 PyObject *sep = NULL;
6313
6314 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6315 return NULL;
6316
6317 if (sep != NULL && sep != Py_None) {
6318 if (PyUnicode_Check(sep))
6319 return _PyUnicode_XStrip(self, striptype, sep);
6320 else if (PyString_Check(sep)) {
6321 PyObject *res;
6322 sep = PyUnicode_FromObject(sep);
6323 if (sep==NULL)
6324 return NULL;
6325 res = _PyUnicode_XStrip(self, striptype, sep);
6326 Py_DECREF(sep);
6327 return res;
6328 }
6329 else {
6330 PyErr_Format(PyExc_TypeError,
6331 "%s arg must be None, unicode or str",
6332 STRIPNAME(striptype));
6333 return NULL;
6334 }
6335 }
6336
6337 return do_strip(self, striptype);
6338}
6339
6340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006342"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006343\n\
6344Return a copy of the string S with leading and trailing\n\
6345whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006346If chars is given and not None, remove characters in chars instead.\n\
6347If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006348
6349static PyObject *
6350unicode_strip(PyUnicodeObject *self, PyObject *args)
6351{
6352 if (PyTuple_GET_SIZE(args) == 0)
6353 return do_strip(self, BOTHSTRIP); /* Common case */
6354 else
6355 return do_argstrip(self, BOTHSTRIP, args);
6356}
6357
6358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006359PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006360"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006361\n\
6362Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006363If chars is given and not None, remove characters in chars instead.\n\
6364If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006365
6366static PyObject *
6367unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6368{
6369 if (PyTuple_GET_SIZE(args) == 0)
6370 return do_strip(self, LEFTSTRIP); /* Common case */
6371 else
6372 return do_argstrip(self, LEFTSTRIP, args);
6373}
6374
6375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006376PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006377"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006378\n\
6379Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006380If chars is given and not None, remove characters in chars instead.\n\
6381If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006382
6383static PyObject *
6384unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6385{
6386 if (PyTuple_GET_SIZE(args) == 0)
6387 return do_strip(self, RIGHTSTRIP); /* Common case */
6388 else
6389 return do_argstrip(self, RIGHTSTRIP, args);
6390}
6391
6392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006394unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
6396 PyUnicodeObject *u;
6397 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006399 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401 if (len < 0)
6402 len = 0;
6403
Tim Peters7a29bd52001-09-12 03:03:31 +00006404 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 /* no repeat, return original string */
6406 Py_INCREF(str);
6407 return (PyObject*) str;
6408 }
Tim Peters8f422462000-09-09 06:13:41 +00006409
6410 /* ensure # of chars needed doesn't overflow int and # of bytes
6411 * needed doesn't overflow size_t
6412 */
6413 nchars = len * str->length;
6414 if (len && nchars / len != str->length) {
6415 PyErr_SetString(PyExc_OverflowError,
6416 "repeated string is too long");
6417 return NULL;
6418 }
6419 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6420 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6421 PyErr_SetString(PyExc_OverflowError,
6422 "repeated string is too long");
6423 return NULL;
6424 }
6425 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 if (!u)
6427 return NULL;
6428
6429 p = u->str;
6430
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 if (str->length == 1 && len > 0) {
6432 Py_UNICODE_FILL(p, str->str[0], len);
6433 } else {
6434 Py_ssize_t done = 0; /* number of characters copied this far */
6435 if (done < nchars) {
6436 Py_UNICODE_COPY(p, str->str, str->length);
6437 done = str->length;
6438 }
6439 while (done < nchars) {
6440 int n = (done <= nchars-done) ? done : nchars-done;
6441 Py_UNICODE_COPY(p+done, p, n);
6442 done += n;
6443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
6445
6446 return (PyObject*) u;
6447}
6448
6449PyObject *PyUnicode_Replace(PyObject *obj,
6450 PyObject *subobj,
6451 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
6454 PyObject *self;
6455 PyObject *str1;
6456 PyObject *str2;
6457 PyObject *result;
6458
6459 self = PyUnicode_FromObject(obj);
6460 if (self == NULL)
6461 return NULL;
6462 str1 = PyUnicode_FromObject(subobj);
6463 if (str1 == NULL) {
6464 Py_DECREF(self);
6465 return NULL;
6466 }
6467 str2 = PyUnicode_FromObject(replobj);
6468 if (str2 == NULL) {
6469 Py_DECREF(self);
6470 Py_DECREF(str1);
6471 return NULL;
6472 }
Tim Petersced69f82003-09-16 20:30:58 +00006473 result = replace((PyUnicodeObject *)self,
6474 (PyUnicodeObject *)str1,
6475 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 maxcount);
6477 Py_DECREF(self);
6478 Py_DECREF(str1);
6479 Py_DECREF(str2);
6480 return result;
6481}
6482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006483PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484"S.replace (old, new[, maxsplit]) -> unicode\n\
6485\n\
6486Return a copy of S with all occurrences of substring\n\
6487old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
6490static PyObject*
6491unicode_replace(PyUnicodeObject *self, PyObject *args)
6492{
6493 PyUnicodeObject *str1;
6494 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 PyObject *result;
6497
Martin v. Löwis18e16552006-02-15 17:27:45 +00006498 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return NULL;
6500 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6501 if (str1 == NULL)
6502 return NULL;
6503 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006504 if (str2 == NULL) {
6505 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
6509 result = replace(self, str1, str2, maxcount);
6510
6511 Py_DECREF(str1);
6512 Py_DECREF(str2);
6513 return result;
6514}
6515
6516static
6517PyObject *unicode_repr(PyObject *unicode)
6518{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006519 PyObject *repr;
6520 char *p;
6521 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6522 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6523
6524 /* XXX(nnorwitz): rather than over-allocating, it would be
6525 better to choose a different scheme. Perhaps scan the
6526 first N-chars of the string and allocate based on that size.
6527 */
6528 /* Initial allocation is based on the longest-possible unichr
6529 escape.
6530
6531 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6532 unichr, so in this case it's the longest unichr escape. In
6533 narrow (UTF-16) builds this is five chars per source unichr
6534 since there are two unichrs in the surrogate pair, so in narrow
6535 (UTF-16) builds it's not the longest unichr escape.
6536
6537 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6538 so in the narrow (UTF-16) build case it's the longest unichr
6539 escape.
6540 */
6541
6542 repr = PyString_FromStringAndSize(NULL,
6543 2 /* quotes */
6544#ifdef Py_UNICODE_WIDE
6545 + 10*size
6546#else
6547 + 6*size
6548#endif
6549 + 1);
6550 if (repr == NULL)
6551 return NULL;
6552
6553 p = PyString_AS_STRING(repr);
6554
6555 /* Add quote */
6556 *p++ = (findchar(s, size, '\'') &&
6557 !findchar(s, size, '"')) ? '"' : '\'';
6558 while (size-- > 0) {
6559 Py_UNICODE ch = *s++;
6560
6561 /* Escape quotes and backslashes */
6562 if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
6563 *p++ = '\\';
6564 *p++ = (char) ch;
6565 continue;
6566 }
6567
6568#ifdef Py_UNICODE_WIDE
6569 /* Map 21-bit characters to '\U00xxxxxx' */
6570 else if (ch >= 0x10000) {
6571 *p++ = '\\';
6572 *p++ = 'U';
6573 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6574 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6575 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6576 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6577 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6578 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6579 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6580 *p++ = hexdigits[ch & 0x0000000F];
6581 continue;
6582 }
6583#else
6584 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6585 else if (ch >= 0xD800 && ch < 0xDC00) {
6586 Py_UNICODE ch2;
6587 Py_UCS4 ucs;
6588
6589 ch2 = *s++;
6590 size--;
6591 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6592 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6593 *p++ = '\\';
6594 *p++ = 'U';
6595 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6596 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6597 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6598 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6599 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6600 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6601 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6602 *p++ = hexdigits[ucs & 0x0000000F];
6603 continue;
6604 }
6605 /* Fall through: isolated surrogates are copied as-is */
6606 s--;
6607 size++;
6608 }
6609#endif
6610
6611 /* Map 16-bit characters to '\uxxxx' */
6612 if (ch >= 256) {
6613 *p++ = '\\';
6614 *p++ = 'u';
6615 *p++ = hexdigits[(ch >> 12) & 0x000F];
6616 *p++ = hexdigits[(ch >> 8) & 0x000F];
6617 *p++ = hexdigits[(ch >> 4) & 0x000F];
6618 *p++ = hexdigits[ch & 0x000F];
6619 }
6620
6621 /* Map special whitespace to '\t', \n', '\r' */
6622 else if (ch == '\t') {
6623 *p++ = '\\';
6624 *p++ = 't';
6625 }
6626 else if (ch == '\n') {
6627 *p++ = '\\';
6628 *p++ = 'n';
6629 }
6630 else if (ch == '\r') {
6631 *p++ = '\\';
6632 *p++ = 'r';
6633 }
6634
6635 /* Map non-printable US ASCII to '\xhh' */
6636 else if (ch < ' ' || ch >= 0x7F) {
6637 *p++ = '\\';
6638 *p++ = 'x';
6639 *p++ = hexdigits[(ch >> 4) & 0x000F];
6640 *p++ = hexdigits[ch & 0x000F];
6641 }
6642
6643 /* Copy everything else as-is */
6644 else
6645 *p++ = (char) ch;
6646 }
6647 /* Add quote */
6648 *p++ = PyString_AS_STRING(repr)[0];
6649
6650 *p = '\0';
6651 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
6652 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653}
6654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006655PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656"S.rfind(sub [,start [,end]]) -> int\n\
6657\n\
6658Return the highest index in S where substring sub is found,\n\
6659such that sub is contained within s[start,end]. Optional\n\
6660arguments start and end are interpreted as in slice notation.\n\
6661\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006662Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
6664static PyObject *
6665unicode_rfind(PyUnicodeObject *self, PyObject *args)
6666{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006667 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006669 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006670 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
Guido van Rossumb8872e62000-05-09 14:14:27 +00006672 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6673 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006675 substring = PyUnicode_FromObject(substring);
6676 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 return NULL;
6678
Thomas Wouters477c8d52006-05-27 19:21:47 +00006679 result = stringlib_rfind_slice(
6680 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6681 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6682 start, end
6683 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
6685 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006686
6687 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006690PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691"S.rindex(sub [,start [,end]]) -> int\n\
6692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
6695static PyObject *
6696unicode_rindex(PyUnicodeObject *self, PyObject *args)
6697{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006698 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006699 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006700 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006701 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
Guido van Rossumb8872e62000-05-09 14:14:27 +00006703 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6704 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006706 substring = PyUnicode_FromObject(substring);
6707 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 return NULL;
6709
Thomas Wouters477c8d52006-05-27 19:21:47 +00006710 result = stringlib_rfind_slice(
6711 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6712 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6713 start, end
6714 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715
6716 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (result < 0) {
6719 PyErr_SetString(PyExc_ValueError, "substring not found");
6720 return NULL;
6721 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006722 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723}
6724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006725PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006726"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727\n\
6728Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006729done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
6731static PyObject *
6732unicode_rjust(PyUnicodeObject *self, PyObject *args)
6733{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006734 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006735 Py_UNICODE fillchar = ' ';
6736
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006737 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return NULL;
6739
Tim Peters7a29bd52001-09-12 03:03:31 +00006740 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 Py_INCREF(self);
6742 return (PyObject*) self;
6743 }
6744
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006745 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006749unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750{
6751 /* standard clamping */
6752 if (start < 0)
6753 start = 0;
6754 if (end < 0)
6755 end = 0;
6756 if (end > self->length)
6757 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006758 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 /* full slice, return original string */
6760 Py_INCREF(self);
6761 return (PyObject*) self;
6762 }
6763 if (start > end)
6764 start = end;
6765 /* copy slice */
6766 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6767 end - start);
6768}
6769
6770PyObject *PyUnicode_Split(PyObject *s,
6771 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006772 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
6774 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 s = PyUnicode_FromObject(s);
6777 if (s == NULL)
6778 return NULL;
6779 if (sep != NULL) {
6780 sep = PyUnicode_FromObject(sep);
6781 if (sep == NULL) {
6782 Py_DECREF(s);
6783 return NULL;
6784 }
6785 }
6786
6787 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6788
6789 Py_DECREF(s);
6790 Py_XDECREF(sep);
6791 return result;
6792}
6793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795"S.split([sep [,maxsplit]]) -> list of strings\n\
6796\n\
6797Return a list of the words in S, using sep as the\n\
6798delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006799splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006800any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
6802static PyObject*
6803unicode_split(PyUnicodeObject *self, PyObject *args)
6804{
6805 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006806 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807
Martin v. Löwis18e16552006-02-15 17:27:45 +00006808 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 return NULL;
6810
6811 if (substring == Py_None)
6812 return split(self, NULL, maxcount);
6813 else if (PyUnicode_Check(substring))
6814 return split(self, (PyUnicodeObject *)substring, maxcount);
6815 else
6816 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6817}
6818
Thomas Wouters477c8d52006-05-27 19:21:47 +00006819PyObject *
6820PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6821{
6822 PyObject* str_obj;
6823 PyObject* sep_obj;
6824 PyObject* out;
6825
6826 str_obj = PyUnicode_FromObject(str_in);
6827 if (!str_obj)
6828 return NULL;
6829 sep_obj = PyUnicode_FromObject(sep_in);
6830 if (!sep_obj) {
6831 Py_DECREF(str_obj);
6832 return NULL;
6833 }
6834
6835 out = stringlib_partition(
6836 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6837 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6838 );
6839
6840 Py_DECREF(sep_obj);
6841 Py_DECREF(str_obj);
6842
6843 return out;
6844}
6845
6846
6847PyObject *
6848PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6849{
6850 PyObject* str_obj;
6851 PyObject* sep_obj;
6852 PyObject* out;
6853
6854 str_obj = PyUnicode_FromObject(str_in);
6855 if (!str_obj)
6856 return NULL;
6857 sep_obj = PyUnicode_FromObject(sep_in);
6858 if (!sep_obj) {
6859 Py_DECREF(str_obj);
6860 return NULL;
6861 }
6862
6863 out = stringlib_rpartition(
6864 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6865 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6866 );
6867
6868 Py_DECREF(sep_obj);
6869 Py_DECREF(str_obj);
6870
6871 return out;
6872}
6873
6874PyDoc_STRVAR(partition__doc__,
6875"S.partition(sep) -> (head, sep, tail)\n\
6876\n\
6877Searches for the separator sep in S, and returns the part before it,\n\
6878the separator itself, and the part after it. If the separator is not\n\
6879found, returns S and two empty strings.");
6880
6881static PyObject*
6882unicode_partition(PyUnicodeObject *self, PyObject *separator)
6883{
6884 return PyUnicode_Partition((PyObject *)self, separator);
6885}
6886
6887PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006888"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006889\n\
6890Searches for the separator sep in S, starting at the end of S, and returns\n\
6891the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006892separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006893
6894static PyObject*
6895unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6896{
6897 return PyUnicode_RPartition((PyObject *)self, separator);
6898}
6899
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006900PyObject *PyUnicode_RSplit(PyObject *s,
6901 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006903{
6904 PyObject *result;
6905
6906 s = PyUnicode_FromObject(s);
6907 if (s == NULL)
6908 return NULL;
6909 if (sep != NULL) {
6910 sep = PyUnicode_FromObject(sep);
6911 if (sep == NULL) {
6912 Py_DECREF(s);
6913 return NULL;
6914 }
6915 }
6916
6917 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6918
6919 Py_DECREF(s);
6920 Py_XDECREF(sep);
6921 return result;
6922}
6923
6924PyDoc_STRVAR(rsplit__doc__,
6925"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6926\n\
6927Return a list of the words in S, using sep as the\n\
6928delimiter string, starting at the end of the string and\n\
6929working to the front. If maxsplit is given, at most maxsplit\n\
6930splits are done. If sep is not specified, any whitespace string\n\
6931is a separator.");
6932
6933static PyObject*
6934unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6935{
6936 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006937 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006938
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006940 return NULL;
6941
6942 if (substring == Py_None)
6943 return rsplit(self, NULL, maxcount);
6944 else if (PyUnicode_Check(substring))
6945 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6946 else
6947 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6948}
6949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006950PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006951"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952\n\
6953Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006954Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
6957static PyObject*
6958unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6959{
Guido van Rossum86662912000-04-11 15:38:46 +00006960 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
Guido van Rossum86662912000-04-11 15:38:46 +00006962 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 return NULL;
6964
Guido van Rossum86662912000-04-11 15:38:46 +00006965 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966}
6967
6968static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006969PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006971 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6972 Py_XINCREF(res);
6973 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974}
6975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006976PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977"S.swapcase() -> unicode\n\
6978\n\
6979Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981
6982static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006983unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 return fixup(self, fixswapcase);
6986}
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989"S.translate(table) -> unicode\n\
6990\n\
6991Return a copy of the string S, where all characters have been mapped\n\
6992through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006993Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6994Unmapped characters are left untouched. Characters mapped to None\n\
6995are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
Tim Petersced69f82003-09-16 20:30:58 +00007000 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007002 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 "ignore");
7004}
7005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007"S.upper() -> unicode\n\
7008\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010
7011static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007012unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 return fixup(self, fixupper);
7015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018"S.zfill(width) -> unicode\n\
7019\n\
7020Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject *
7024unicode_zfill(PyUnicodeObject *self, PyObject *args)
7025{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007026 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 PyUnicodeObject *u;
7028
Martin v. Löwis18e16552006-02-15 17:27:45 +00007029 Py_ssize_t width;
7030 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 return NULL;
7032
7033 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007034 if (PyUnicode_CheckExact(self)) {
7035 Py_INCREF(self);
7036 return (PyObject*) self;
7037 }
7038 else
7039 return PyUnicode_FromUnicode(
7040 PyUnicode_AS_UNICODE(self),
7041 PyUnicode_GET_SIZE(self)
7042 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 }
7044
7045 fill = width - self->length;
7046
7047 u = pad(self, fill, 0, '0');
7048
Walter Dörwald068325e2002-04-15 13:36:47 +00007049 if (u == NULL)
7050 return NULL;
7051
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 if (u->str[fill] == '+' || u->str[fill] == '-') {
7053 /* move sign to beginning of string */
7054 u->str[0] = u->str[fill];
7055 u->str[fill] = '0';
7056 }
7057
7058 return (PyObject*) u;
7059}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060
#if 0
/* Compiled out: returns the value of unicode_freelist_size as a Python
   int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007069PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007070"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007072Return True if S starts with the specified prefix, False otherwise.\n\
7073With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074With optional end, stop comparing S at that position.\n\
7075prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
7077static PyObject *
7078unicode_startswith(PyUnicodeObject *self,
7079 PyObject *args)
7080{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007083 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007084 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007088 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090 if (PyTuple_Check(subobj)) {
7091 Py_ssize_t i;
7092 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7093 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7094 PyTuple_GET_ITEM(subobj, i));
7095 if (substring == NULL)
7096 return NULL;
7097 result = tailmatch(self, substring, start, end, -1);
7098 Py_DECREF(substring);
7099 if (result) {
7100 Py_RETURN_TRUE;
7101 }
7102 }
7103 /* nothing matched */
7104 Py_RETURN_FALSE;
7105 }
7106 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 return NULL;
7109 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007116"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007118Return True if S ends with the specified suffix, False otherwise.\n\
7119With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120With optional end, stop comparing S at that position.\n\
7121suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122
7123static PyObject *
7124unicode_endswith(PyUnicodeObject *self,
7125 PyObject *args)
7126{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007129 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007130 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7134 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 if (PyTuple_Check(subobj)) {
7137 Py_ssize_t i;
7138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7139 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7140 PyTuple_GET_ITEM(subobj, i));
7141 if (substring == NULL)
7142 return NULL;
7143 result = tailmatch(self, substring, start, end, +1);
7144 Py_DECREF(substring);
7145 if (result) {
7146 Py_RETURN_TRUE;
7147 }
7148 }
7149 Py_RETURN_FALSE;
7150 }
7151 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158}
7159
7160
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007161
7162static PyObject *
7163unicode_getnewargs(PyUnicodeObject *v)
7164{
7165 return Py_BuildValue("(u#)", v->str, v->length);
7166}
7167
7168
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169static PyMethodDef unicode_methods[] = {
7170
7171 /* Order is according to common usage: often used methods should
7172 appear first, since lookup is done sequentially. */
7173
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007174 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7175 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7176 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007177 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007178 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7179 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7180 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7181 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7182 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7183 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7184 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007185 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007186 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7187 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7188 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007190 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007191/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7192 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7193 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7194 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007196 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007197 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007199 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7200 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7201 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7202 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7203 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7204 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7205 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7206 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7207 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7208 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7209 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7210 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7211 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7212 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007213 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007214#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007215 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216#endif
7217
7218#if 0
7219 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007220 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221#endif
7222
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007223 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 {NULL, NULL}
7225};
7226
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007227static PyObject *
7228unicode_mod(PyObject *v, PyObject *w)
7229{
7230 if (!PyUnicode_Check(v)) {
7231 Py_INCREF(Py_NotImplemented);
7232 return Py_NotImplemented;
7233 }
7234 return PyUnicode_Format(v, w);
7235}
7236
7237static PyNumberMethods unicode_as_number = {
7238 0, /*nb_add*/
7239 0, /*nb_subtract*/
7240 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007241 unicode_mod, /*nb_remainder*/
7242};
7243
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007246 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007247 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7248 (ssizeargfunc) unicode_getitem, /* sq_item */
7249 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 0, /* sq_ass_item */
7251 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007252 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253};
7254
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007255static PyObject*
7256unicode_subscript(PyUnicodeObject* self, PyObject* item)
7257{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007258 if (PyIndex_Check(item)) {
7259 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007260 if (i == -1 && PyErr_Occurred())
7261 return NULL;
7262 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007263 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007264 return unicode_getitem(self, i);
7265 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007267 Py_UNICODE* source_buf;
7268 Py_UNICODE* result_buf;
7269 PyObject* result;
7270
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007271 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007272 &start, &stop, &step, &slicelength) < 0) {
7273 return NULL;
7274 }
7275
7276 if (slicelength <= 0) {
7277 return PyUnicode_FromUnicode(NULL, 0);
7278 } else {
7279 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007280 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7281 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007282
7283 if (result_buf == NULL)
7284 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007285
7286 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7287 result_buf[i] = source_buf[cur];
7288 }
Tim Petersced69f82003-09-16 20:30:58 +00007289
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007290 result = PyUnicode_FromUnicode(result_buf, slicelength);
7291 PyMem_FREE(result_buf);
7292 return result;
7293 }
7294 } else {
7295 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7296 return NULL;
7297 }
7298}
7299
7300static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007302 (binaryfunc)unicode_subscript, /* mp_subscript */
7303 (objobjargproc)0, /* mp_ass_subscript */
7304};
7305
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007308 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 const void **ptr)
7310{
7311 if (index != 0) {
7312 PyErr_SetString(PyExc_SystemError,
7313 "accessing non-existent unicode segment");
7314 return -1;
7315 }
7316 *ptr = (void *) self->str;
7317 return PyUnicode_GET_DATA_SIZE(self);
7318}
7319
Martin v. Löwis18e16552006-02-15 17:27:45 +00007320static Py_ssize_t
7321unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 const void **ptr)
7323{
7324 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007325 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 return -1;
7327}
7328
7329static int
7330unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007331 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332{
7333 if (lenp)
7334 *lenp = PyUnicode_GET_DATA_SIZE(self);
7335 return 1;
7336}
7337
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007338static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 const void **ptr)
7342{
7343 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007344
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 if (index != 0) {
7346 PyErr_SetString(PyExc_SystemError,
7347 "accessing non-existent unicode segment");
7348 return -1;
7349 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007350 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 if (str == NULL)
7352 return -1;
7353 *ptr = (void *) PyString_AS_STRING(str);
7354 return PyString_GET_SIZE(str);
7355}
7356
7357/* Helpers for PyUnicode_Format() */
7358
7359static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007362 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 if (argidx < arglen) {
7364 (*p_argidx)++;
7365 if (arglen < 0)
7366 return args;
7367 else
7368 return PyTuple_GetItem(args, argidx);
7369 }
7370 PyErr_SetString(PyExc_TypeError,
7371 "not enough arguments for format string");
7372 return NULL;
7373}
7374
/* Bit flags for a conversion specifier.  F_ALT selects the "#"
   alternate form (see formatfloat/formatint); the others presumably
   mirror the printf-style '-', '+', ' ' and '0' flags -- confirm
   against the parser in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7380
Martin v. Löwis18e16552006-02-15 17:27:45 +00007381static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007382strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007384 register Py_ssize_t i;
7385 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 for (i = len - 1; i >= 0; i--)
7387 buffer[i] = (Py_UNICODE) charbuffer[i];
7388
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 return len;
7390}
7391
Neal Norwitzfc76d632006-01-10 06:03:13 +00007392static int
7393doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7394{
Tim Peters15231542006-02-16 01:08:01 +00007395 Py_ssize_t result;
7396
Neal Norwitzfc76d632006-01-10 06:03:13 +00007397 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007398 result = strtounicode(buffer, (char *)buffer);
7399 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007400}
7401
7402static int
7403longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7404{
Tim Peters15231542006-02-16 01:08:01 +00007405 Py_ssize_t result;
7406
Neal Norwitzfc76d632006-01-10 06:03:13 +00007407 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007408 result = strtounicode(buffer, (char *)buffer);
7409 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007410}
7411
Guido van Rossum078151d2002-08-11 04:24:12 +00007412/* XXX To save some code duplication, formatfloat/long/int could have been
7413 shared with stringobject.c, converting from 8-bit to Unicode after the
7414 formatting is done. */
7415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416static int
7417formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007418 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 int flags,
7420 int prec,
7421 int type,
7422 PyObject *v)
7423{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007424 /* fmt = '%#.' + `prec` + `type`
7425 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 char fmt[20];
7427 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007428
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 x = PyFloat_AsDouble(v);
7430 if (x == -1.0 && PyErr_Occurred())
7431 return -1;
7432 if (prec < 0)
7433 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7435 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007436 /* Worst case length calc to ensure no buffer overrun:
7437
7438 'g' formats:
7439 fmt = %#.<prec>g
7440 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7441 for any double rep.)
7442 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7443
7444 'f' formats:
7445 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7446 len = 1 + 50 + 1 + prec = 52 + prec
7447
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007448 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007449 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007450
7451 */
7452 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7453 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007454 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007455 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007456 return -1;
7457 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007458 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7459 (flags&F_ALT) ? "#" : "",
7460 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007461 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462}
7463
Tim Peters38fd5b62000-09-21 05:43:11 +00007464static PyObject*
7465formatlong(PyObject *val, int flags, int prec, int type)
7466{
7467 char *buf;
7468 int i, len;
7469 PyObject *str; /* temporary string object. */
7470 PyUnicodeObject *result;
7471
7472 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7473 if (!str)
7474 return NULL;
7475 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007476 if (!result) {
7477 Py_DECREF(str);
7478 return NULL;
7479 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007480 for (i = 0; i < len; i++)
7481 result->str[i] = buf[i];
7482 result->str[len] = 0;
7483 Py_DECREF(str);
7484 return (PyObject*)result;
7485}
7486
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487static int
7488formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007489 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 int flags,
7491 int prec,
7492 int type,
7493 PyObject *v)
7494{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007495 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007496 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7497 * + 1 + 1
7498 * = 24
7499 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007500 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007501 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 long x;
7503
7504 x = PyInt_AsLong(v);
7505 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007506 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007507 if (x < 0 && type == 'u') {
7508 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007509 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007510 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7511 sign = "-";
7512 else
7513 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007515 prec = 1;
7516
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007517 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7518 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007519 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007520 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007521 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007522 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007523 return -1;
7524 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007525
7526 if ((flags & F_ALT) &&
7527 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007528 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007529 * of issues that cause pain:
7530 * - when 0 is being converted, the C standard leaves off
7531 * the '0x' or '0X', which is inconsistent with other
7532 * %#x/%#X conversions and inconsistent with Python's
7533 * hex() function
7534 * - there are platforms that violate the standard and
7535 * convert 0 with the '0x' or '0X'
7536 * (Metrowerks, Compaq Tru64)
7537 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007538 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007539 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007540 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007541 * We can achieve the desired consistency by inserting our
7542 * own '0x' or '0X' prefix, and substituting %x/%X in place
7543 * of %#x/%#X.
7544 *
7545 * Note that this is the same approach as used in
7546 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007547 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007548 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7549 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007550 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007551 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007552 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7553 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007554 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007555 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007556 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007557 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007558 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007559 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560}
7561
7562static int
7563formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007564 size_t buflen,
7565 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007567 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007568 if (PyUnicode_Check(v)) {
7569 if (PyUnicode_GET_SIZE(v) != 1)
7570 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007574 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007575 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007576 goto onError;
7577 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
7580 else {
7581 /* Integer input truncated to a character */
7582 long x;
7583 x = PyInt_AsLong(v);
7584 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007585 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007586#ifdef Py_UNICODE_WIDE
7587 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007588 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007589 "%c arg not in range(0x110000) "
7590 "(wide Python build)");
7591 return -1;
7592 }
7593#else
7594 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007595 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007596 "%c arg not in range(0x10000) "
7597 "(narrow Python build)");
7598 return -1;
7599 }
7600#endif
7601 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 }
7603 buf[1] = '\0';
7604 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007605
7606 onError:
7607 PyErr_SetString(PyExc_TypeError,
7608 "%c requires int or char");
7609 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610}
7611
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer into which floats,
   ints and chars are formatted.  XXX The value is a magic number.
   Every formatting routine bounds-checks against it so that nothing
   overflows; allocating a buffer of the right size per conversion
   might be a better solution, but the fixed buffer is sufficient for
   now.
*/
#define FORMATBUFLEN ((size_t)120)
7621
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622PyObject *PyUnicode_Format(PyObject *format,
7623 PyObject *args)
7624{
7625 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 int args_owned = 0;
7628 PyUnicodeObject *result = NULL;
7629 PyObject *dict = NULL;
7630 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007631
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 if (format == NULL || args == NULL) {
7633 PyErr_BadInternalCall();
7634 return NULL;
7635 }
7636 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007637 if (uformat == NULL)
7638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 fmt = PyUnicode_AS_UNICODE(uformat);
7640 fmtcnt = PyUnicode_GET_SIZE(uformat);
7641
7642 reslen = rescnt = fmtcnt + 100;
7643 result = _PyUnicode_New(reslen);
7644 if (result == NULL)
7645 goto onError;
7646 res = PyUnicode_AS_UNICODE(result);
7647
7648 if (PyTuple_Check(args)) {
7649 arglen = PyTuple_Size(args);
7650 argidx = 0;
7651 }
7652 else {
7653 arglen = -1;
7654 argidx = -2;
7655 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007656 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7657 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 dict = args;
7659
7660 while (--fmtcnt >= 0) {
7661 if (*fmt != '%') {
7662 if (--rescnt < 0) {
7663 rescnt = fmtcnt + 100;
7664 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007665 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007666 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7668 --rescnt;
7669 }
7670 *res++ = *fmt++;
7671 }
7672 else {
7673 /* Got a format specifier */
7674 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007675 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 Py_UNICODE c = '\0';
7678 Py_UNICODE fill;
7679 PyObject *v = NULL;
7680 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007681 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007683 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007684 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686 fmt++;
7687 if (*fmt == '(') {
7688 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007689 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 PyObject *key;
7691 int pcount = 1;
7692
7693 if (dict == NULL) {
7694 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007695 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 goto onError;
7697 }
7698 ++fmt;
7699 --fmtcnt;
7700 keystart = fmt;
7701 /* Skip over balanced parentheses */
7702 while (pcount > 0 && --fmtcnt >= 0) {
7703 if (*fmt == ')')
7704 --pcount;
7705 else if (*fmt == '(')
7706 ++pcount;
7707 fmt++;
7708 }
7709 keylen = fmt - keystart - 1;
7710 if (fmtcnt < 0 || pcount > 0) {
7711 PyErr_SetString(PyExc_ValueError,
7712 "incomplete format key");
7713 goto onError;
7714 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007715#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007716 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 then looked up since Python uses strings to hold
7718 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007719 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 key = PyUnicode_EncodeUTF8(keystart,
7721 keylen,
7722 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007723#else
7724 key = PyUnicode_FromUnicode(keystart, keylen);
7725#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 if (key == NULL)
7727 goto onError;
7728 if (args_owned) {
7729 Py_DECREF(args);
7730 args_owned = 0;
7731 }
7732 args = PyObject_GetItem(dict, key);
7733 Py_DECREF(key);
7734 if (args == NULL) {
7735 goto onError;
7736 }
7737 args_owned = 1;
7738 arglen = -1;
7739 argidx = -2;
7740 }
7741 while (--fmtcnt >= 0) {
7742 switch (c = *fmt++) {
7743 case '-': flags |= F_LJUST; continue;
7744 case '+': flags |= F_SIGN; continue;
7745 case ' ': flags |= F_BLANK; continue;
7746 case '#': flags |= F_ALT; continue;
7747 case '0': flags |= F_ZERO; continue;
7748 }
7749 break;
7750 }
7751 if (c == '*') {
7752 v = getnextarg(args, arglen, &argidx);
7753 if (v == NULL)
7754 goto onError;
7755 if (!PyInt_Check(v)) {
7756 PyErr_SetString(PyExc_TypeError,
7757 "* wants int");
7758 goto onError;
7759 }
7760 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007761 if (width == -1 && PyErr_Occurred())
7762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 if (width < 0) {
7764 flags |= F_LJUST;
7765 width = -width;
7766 }
7767 if (--fmtcnt >= 0)
7768 c = *fmt++;
7769 }
7770 else if (c >= '0' && c <= '9') {
7771 width = c - '0';
7772 while (--fmtcnt >= 0) {
7773 c = *fmt++;
7774 if (c < '0' || c > '9')
7775 break;
7776 if ((width*10) / 10 != width) {
7777 PyErr_SetString(PyExc_ValueError,
7778 "width too big");
7779 goto onError;
7780 }
7781 width = width*10 + (c - '0');
7782 }
7783 }
7784 if (c == '.') {
7785 prec = 0;
7786 if (--fmtcnt >= 0)
7787 c = *fmt++;
7788 if (c == '*') {
7789 v = getnextarg(args, arglen, &argidx);
7790 if (v == NULL)
7791 goto onError;
7792 if (!PyInt_Check(v)) {
7793 PyErr_SetString(PyExc_TypeError,
7794 "* wants int");
7795 goto onError;
7796 }
7797 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007798 if (prec == -1 && PyErr_Occurred())
7799 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 if (prec < 0)
7801 prec = 0;
7802 if (--fmtcnt >= 0)
7803 c = *fmt++;
7804 }
7805 else if (c >= '0' && c <= '9') {
7806 prec = c - '0';
7807 while (--fmtcnt >= 0) {
7808 c = Py_CHARMASK(*fmt++);
7809 if (c < '0' || c > '9')
7810 break;
7811 if ((prec*10) / 10 != prec) {
7812 PyErr_SetString(PyExc_ValueError,
7813 "prec too big");
7814 goto onError;
7815 }
7816 prec = prec*10 + (c - '0');
7817 }
7818 }
7819 } /* prec */
7820 if (fmtcnt >= 0) {
7821 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 if (--fmtcnt >= 0)
7823 c = *fmt++;
7824 }
7825 }
7826 if (fmtcnt < 0) {
7827 PyErr_SetString(PyExc_ValueError,
7828 "incomplete format");
7829 goto onError;
7830 }
7831 if (c != '%') {
7832 v = getnextarg(args, arglen, &argidx);
7833 if (v == NULL)
7834 goto onError;
7835 }
7836 sign = 0;
7837 fill = ' ';
7838 switch (c) {
7839
7840 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007841 pbuf = formatbuf;
7842 /* presume that buffer length is at least 1 */
7843 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 len = 1;
7845 break;
7846
7847 case 's':
7848 case 'r':
7849 if (PyUnicode_Check(v) && c == 's') {
7850 temp = v;
7851 Py_INCREF(temp);
7852 }
7853 else {
7854 PyObject *unicode;
7855 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007856 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 else
7858 temp = PyObject_Repr(v);
7859 if (temp == NULL)
7860 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007861 if (PyUnicode_Check(temp))
7862 /* nothing to do */;
7863 else if (PyString_Check(temp)) {
7864 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007865 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007867 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007869 Py_DECREF(temp);
7870 temp = unicode;
7871 if (temp == NULL)
7872 goto onError;
7873 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007874 else {
7875 Py_DECREF(temp);
7876 PyErr_SetString(PyExc_TypeError,
7877 "%s argument has non-string str()");
7878 goto onError;
7879 }
7880 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007881 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 len = PyUnicode_GET_SIZE(temp);
7883 if (prec >= 0 && len > prec)
7884 len = prec;
7885 break;
7886
7887 case 'i':
7888 case 'd':
7889 case 'u':
7890 case 'o':
7891 case 'x':
7892 case 'X':
7893 if (c == 'i')
7894 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007895 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007896 temp = formatlong(v, flags, prec, c);
7897 if (!temp)
7898 goto onError;
7899 pbuf = PyUnicode_AS_UNICODE(temp);
7900 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007901 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007903 else {
7904 pbuf = formatbuf;
7905 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7906 flags, prec, c, v);
7907 if (len < 0)
7908 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007909 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007910 }
7911 if (flags & F_ZERO)
7912 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 break;
7914
7915 case 'e':
7916 case 'E':
7917 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007918 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 case 'g':
7920 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007921 if (c == 'F')
7922 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007923 pbuf = formatbuf;
7924 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7925 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 if (len < 0)
7927 goto onError;
7928 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007929 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 fill = '0';
7931 break;
7932
7933 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007934 pbuf = formatbuf;
7935 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 if (len < 0)
7937 goto onError;
7938 break;
7939
7940 default:
7941 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007942 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007943 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007944 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007945 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007946 (Py_ssize_t)(fmt - 1 -
7947 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 goto onError;
7949 }
7950 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007951 if (*pbuf == '-' || *pbuf == '+') {
7952 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 len--;
7954 }
7955 else if (flags & F_SIGN)
7956 sign = '+';
7957 else if (flags & F_BLANK)
7958 sign = ' ';
7959 else
7960 sign = 0;
7961 }
7962 if (width < len)
7963 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007964 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 reslen -= rescnt;
7966 rescnt = width + fmtcnt + 100;
7967 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007968 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007969 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007970 PyErr_NoMemory();
7971 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007972 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007973 if (_PyUnicode_Resize(&result, reslen) < 0) {
7974 Py_XDECREF(temp);
7975 goto onError;
7976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 res = PyUnicode_AS_UNICODE(result)
7978 + reslen - rescnt;
7979 }
7980 if (sign) {
7981 if (fill != ' ')
7982 *res++ = sign;
7983 rescnt--;
7984 if (width > len)
7985 width--;
7986 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007987 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7988 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007989 assert(pbuf[1] == c);
7990 if (fill != ' ') {
7991 *res++ = *pbuf++;
7992 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007993 }
Tim Petersfff53252001-04-12 18:38:48 +00007994 rescnt -= 2;
7995 width -= 2;
7996 if (width < 0)
7997 width = 0;
7998 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 if (width > len && !(flags & F_LJUST)) {
8001 do {
8002 --rescnt;
8003 *res++ = fill;
8004 } while (--width > len);
8005 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008006 if (fill == ' ') {
8007 if (sign)
8008 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008009 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008010 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008011 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008012 *res++ = *pbuf++;
8013 *res++ = *pbuf++;
8014 }
8015 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008016 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 res += len;
8018 rescnt -= len;
8019 while (--width >= len) {
8020 --rescnt;
8021 *res++ = ' ';
8022 }
8023 if (dict && (argidx < arglen) && c != '%') {
8024 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008025 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008026 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
8028 }
8029 Py_XDECREF(temp);
8030 } /* '%' */
8031 } /* until end */
8032 if (argidx < arglen && !dict) {
8033 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008034 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 goto onError;
8036 }
8037
Thomas Woutersa96affe2006-03-12 00:29:36 +00008038 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 if (args_owned) {
8041 Py_DECREF(args);
8042 }
8043 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 return (PyObject *)result;
8045
8046 onError:
8047 Py_XDECREF(result);
8048 Py_DECREF(uformat);
8049 if (args_owned) {
8050 Py_DECREF(args);
8051 }
8052 return NULL;
8053}
8054
8055static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056 (readbufferproc) unicode_buffer_getreadbuf,
8057 (writebufferproc) unicode_buffer_getwritebuf,
8058 (segcountproc) unicode_buffer_getsegcount,
8059 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060};
8061
Jeremy Hylton938ace62002-07-17 16:30:39 +00008062static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008063unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8064
Tim Peters6d6c1a32001-08-02 04:15:00 +00008065static PyObject *
8066unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8067{
8068 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008069 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008070 char *encoding = NULL;
8071 char *errors = NULL;
8072
Guido van Rossume023fe02001-08-30 03:12:59 +00008073 if (type != &PyUnicode_Type)
8074 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008075 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8076 kwlist, &x, &encoding, &errors))
8077 return NULL;
8078 if (x == NULL)
8079 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008080 if (encoding == NULL && errors == NULL)
8081 return PyObject_Unicode(x);
8082 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008083 return PyUnicode_FromEncodedObject(x, encoding, errors);
8084}
8085
Guido van Rossume023fe02001-08-30 03:12:59 +00008086static PyObject *
8087unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8088{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008089 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008091
8092 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8093 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8094 if (tmp == NULL)
8095 return NULL;
8096 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008097 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008098 if (pnew == NULL) {
8099 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008100 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008101 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008102 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8103 if (pnew->str == NULL) {
8104 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008105 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008106 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008107 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008108 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008109 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8110 pnew->length = n;
8111 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008112 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008113 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008114}
8115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008116PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008117"unicode(string [, encoding[, errors]]) -> object\n\
8118\n\
8119Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008120encoding defaults to the current default string encoding.\n\
8121errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008122
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008123static PyObject *unicode_iter(PyObject *seq);
8124
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125PyTypeObject PyUnicode_Type = {
8126 PyObject_HEAD_INIT(&PyType_Type)
8127 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008128 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 sizeof(PyUnicodeObject), /* tp_size */
8130 0, /* tp_itemsize */
8131 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008132 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008134 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008136 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008137 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008138 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008140 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 (hashfunc) unicode_hash, /* tp_hash*/
8142 0, /* tp_call*/
8143 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008144 PyObject_GenericGetAttr, /* tp_getattro */
8145 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008147 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8148 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008149 unicode_doc, /* tp_doc */
8150 0, /* tp_traverse */
8151 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008152 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008153 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008154 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008155 0, /* tp_iternext */
8156 unicode_methods, /* tp_methods */
8157 0, /* tp_members */
8158 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008159 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008160 0, /* tp_dict */
8161 0, /* tp_descr_get */
8162 0, /* tp_descr_set */
8163 0, /* tp_dictoffset */
8164 0, /* tp_init */
8165 0, /* tp_alloc */
8166 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008167 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168};
8169
8170/* Initialize the Unicode implementation */
8171
Thomas Wouters78890102000-07-22 19:25:51 +00008172void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008174 int i;
8175
Thomas Wouters477c8d52006-05-27 19:21:47 +00008176 /* XXX - move this array to unicodectype.c ? */
8177 Py_UNICODE linebreak[] = {
8178 0x000A, /* LINE FEED */
8179 0x000D, /* CARRIAGE RETURN */
8180 0x001C, /* FILE SEPARATOR */
8181 0x001D, /* GROUP SEPARATOR */
8182 0x001E, /* RECORD SEPARATOR */
8183 0x0085, /* NEXT LINE */
8184 0x2028, /* LINE SEPARATOR */
8185 0x2029, /* PARAGRAPH SEPARATOR */
8186 };
8187
Fred Drakee4315f52000-05-09 19:53:39 +00008188 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008189 unicode_freelist = NULL;
8190 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008192 if (!unicode_empty)
8193 return;
8194
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008195 for (i = 0; i < 256; i++)
8196 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008197 if (PyType_Ready(&PyUnicode_Type) < 0)
8198 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199
8200 /* initialize the linebreak bloom filter */
8201 bloom_linebreak = make_bloom_mask(
8202 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8203 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008204
8205 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206}
8207
8208/* Finalize the Unicode implementation */
8209
8210void
Thomas Wouters78890102000-07-22 19:25:51 +00008211_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008213 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008214 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008216 Py_XDECREF(unicode_empty);
8217 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008218
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008219 for (i = 0; i < 256; i++) {
8220 if (unicode_latin1[i]) {
8221 Py_DECREF(unicode_latin1[i]);
8222 unicode_latin1[i] = NULL;
8223 }
8224 }
8225
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008226 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 PyUnicodeObject *v = u;
8228 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008229 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008230 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008231 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008232 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008234 unicode_freelist = NULL;
8235 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008237
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008238
8239
8240/********************* Unicode Iterator **************************/
8241
8242typedef struct {
8243 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008244 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008245 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8246} unicodeiterobject;
8247
8248static void
8249unicodeiter_dealloc(unicodeiterobject *it)
8250{
8251 _PyObject_GC_UNTRACK(it);
8252 Py_XDECREF(it->it_seq);
8253 PyObject_GC_Del(it);
8254}
8255
8256static int
8257unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8258{
8259 Py_VISIT(it->it_seq);
8260 return 0;
8261}
8262
8263static PyObject *
8264unicodeiter_next(unicodeiterobject *it)
8265{
8266 PyUnicodeObject *seq;
8267 PyObject *item;
8268
8269 assert(it != NULL);
8270 seq = it->it_seq;
8271 if (seq == NULL)
8272 return NULL;
8273 assert(PyUnicode_Check(seq));
8274
8275 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008276 item = PyUnicode_FromUnicode(
8277 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008278 if (item != NULL)
8279 ++it->it_index;
8280 return item;
8281 }
8282
8283 Py_DECREF(seq);
8284 it->it_seq = NULL;
8285 return NULL;
8286}
8287
8288static PyObject *
8289unicodeiter_len(unicodeiterobject *it)
8290{
8291 Py_ssize_t len = 0;
8292 if (it->it_seq)
8293 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8294 return PyInt_FromSsize_t(len);
8295}
8296
8297PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8298
8299static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008300 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8301 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008302 {NULL, NULL} /* sentinel */
8303};
8304
8305PyTypeObject PyUnicodeIter_Type = {
8306 PyObject_HEAD_INIT(&PyType_Type)
8307 0, /* ob_size */
8308 "unicodeiterator", /* tp_name */
8309 sizeof(unicodeiterobject), /* tp_basicsize */
8310 0, /* tp_itemsize */
8311 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008312 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008313 0, /* tp_print */
8314 0, /* tp_getattr */
8315 0, /* tp_setattr */
8316 0, /* tp_compare */
8317 0, /* tp_repr */
8318 0, /* tp_as_number */
8319 0, /* tp_as_sequence */
8320 0, /* tp_as_mapping */
8321 0, /* tp_hash */
8322 0, /* tp_call */
8323 0, /* tp_str */
8324 PyObject_GenericGetAttr, /* tp_getattro */
8325 0, /* tp_setattro */
8326 0, /* tp_as_buffer */
8327 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8328 0, /* tp_doc */
8329 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8330 0, /* tp_clear */
8331 0, /* tp_richcompare */
8332 0, /* tp_weaklistoffset */
8333 PyObject_SelfIter, /* tp_iter */
8334 (iternextfunc)unicodeiter_next, /* tp_iternext */
8335 unicodeiter_methods, /* tp_methods */
8336 0,
8337};
8338
8339static PyObject *
8340unicode_iter(PyObject *seq)
8341{
8342 unicodeiterobject *it;
8343
8344 if (!PyUnicode_Check(seq)) {
8345 PyErr_BadInternalCall();
8346 return NULL;
8347 }
8348 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8349 if (it == NULL)
8350 return NULL;
8351 it->it_index = 0;
8352 Py_INCREF(seq);
8353 it->it_seq = (PyUnicodeObject *)seq;
8354 _PyObject_GC_TRACK(it);
8355 return (PyObject *)it;
8356}
8357
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008358#ifdef __cplusplus
8359}
8360#endif
8361
8362
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008363/*
8364Local variables:
8365c-basic-offset: 4
8366indent-tabs-mode: nil
8367End:
8368*/