blob: 975f192368de4707a2301fbbfe5d60bc7592648b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits of each Unicode character as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
/* Type of a bloom mask.  Only bits 0..31 are ever addressed, because the
   bit index is taken from the least 5 bits of a character. */
#define BLOOM_MASK unsigned long

/* Mask for the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.
   The shifted constant is an unsigned long so that bit 31 can be
   addressed without shifting into the sign bit of an int, and mask is
   parenthesized so that an arbitrary expression can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-test with the bloom mask, followed by the exact test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True if chr is a member of set: the bloom mask is consulted first and
   the exact (linear) search only runs when the mask says "maybe".
   The whole expansion is parenthesized so the macro can be combined
   safely with !, || and ?: at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !  Convenience wrapper
   around PyUnicode_Resize() that takes the address of a variable of any
   object pointer type. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
430 unicode = _PyUnicode_New(size);
431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937/* --- UTF-7 Codec -------------------------------------------------------- */
938
939/* see RFC2152 for details */
940
/* Classification of the 128 ASCII code points for the UTF-7 codec;
   says whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Indexed by character code; each row below covers 16 code points. */
static char utf7_special[128] = {
    /* 0x00-0x0F: controls; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
959
/* Helper macros for the UTF-7 codec.

   SPECIAL(c, encodeO, encodeWS)  true if c must be base64-encoded: it is
       outside 1..127, or utf7_special marks it as special, or it belongs
       to an optional class (whitespace / Set O) the caller asked to encode.
   B64(n)      base64 digit for the low 6 bits of n.
   B64CHAR(c)  true if c is one of the 64 base64 digits.
   UB64(c)     value of base64 digit c (assumes c satisfies B64CHAR).
   ENCODE      emit base64 digits while at least 6 bits are buffered.
   DECODE      emit 16-bit units while at least 16 bits are buffered; it
       expects `errmsg` and a `utf7Error` label at the expansion site.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* Explicit ASCII ranges instead of isalnum(): callers pass Py_UNICODE
   values, and isalnum() is undefined for arguments that do not fit in an
   unsigned char (and is locale dependent for those that do). */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* NOTE(review): the inner comment speaks of a high surrogate while the
   range tested is 0xDC00..0xDFFF - confirm the intended range. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
/* The UTF-7 helper macros are private to the codec above; drop them so
   they cannot collide with anything defined later in the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242/* --- UTF-8 Codec -------------------------------------------------------- */
1243
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   Indexed by the byte value; each row below covers 16 values. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
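/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): this comment looks stale.  It sits directly above
   findchar(), which it does not describe, and no function in this part
   of the file takes a "quotes" argument -- confirm against
   PyUnicode_EncodeUnicodeEscape() and update or remove.

*/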
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Walter Dörwald79e913e2007-05-12 11:08:06 +00002097static const char *hexdigits = "0123456789abcdef";
2098
2099PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Thomas Wouters89f507f2006-12-13 04:49:30 +00002105 /* XXX(nnorwitz): rather than over-allocating, it would be
2106 better to choose a different scheme. Perhaps scan the
2107 first N-chars of the string and allocate based on that size.
2108 */
2109 /* Initial allocation is based on the longest-possible unichr
2110 escape.
2111
2112 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2113 unichr, so in this case it's the longest unichr escape. In
2114 narrow (UTF-16) builds this is five chars per source unichr
2115 since there are two unichrs in the surrogate pair, so in narrow
2116 (UTF-16) builds it's not the longest unichr escape.
2117
2118 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2119 so in the narrow (UTF-16) build case it's the longest unichr
2120 escape.
2121 */
2122
Walter Dörwald79e913e2007-05-12 11:08:06 +00002123 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002124#ifdef Py_UNICODE_WIDE
2125 + 10*size
2126#else
2127 + 6*size
2128#endif
2129 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 if (repr == NULL)
2131 return NULL;
2132
Walter Dörwald79e913e2007-05-12 11:08:06 +00002133 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 while (size-- > 0) {
2136 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Walter Dörwald79e913e2007-05-12 11:08:06 +00002138 /* Escape backslashes */
2139 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 *p++ = '\\';
2141 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002142 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002145#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002146 /* Map 21-bit characters to '\U00xxxxxx' */
2147 else if (ch >= 0x10000) {
2148 *p++ = '\\';
2149 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002150 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2151 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2152 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2153 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2154 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2155 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2156 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2157 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002159 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002160#else
2161 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002162 else if (ch >= 0xD800 && ch < 0xDC00) {
2163 Py_UNICODE ch2;
2164 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002165
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002166 ch2 = *s++;
2167 size--;
2168 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2169 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2170 *p++ = '\\';
2171 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002172 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2173 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2174 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2175 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2176 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2177 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2178 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2179 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002180 continue;
2181 }
2182 /* Fall through: isolated surrogates are copied as-is */
2183 s--;
2184 size++;
2185 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002186#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002189 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 *p++ = '\\';
2191 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002192 *p++ = hexdigits[(ch >> 12) & 0x000F];
2193 *p++ = hexdigits[(ch >> 8) & 0x000F];
2194 *p++ = hexdigits[(ch >> 4) & 0x000F];
2195 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002197
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002198 /* Map special whitespace to '\t', \n', '\r' */
2199 else if (ch == '\t') {
2200 *p++ = '\\';
2201 *p++ = 't';
2202 }
2203 else if (ch == '\n') {
2204 *p++ = '\\';
2205 *p++ = 'n';
2206 }
2207 else if (ch == '\r') {
2208 *p++ = '\\';
2209 *p++ = 'r';
2210 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002211
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002212 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002213 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002215 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002216 *p++ = hexdigits[(ch >> 4) & 0x000F];
2217 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 /* Copy everything else as-is */
2221 else
2222 *p++ = (char) ch;
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002226 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2227 Py_DECREF(repr);
2228 return NULL;
2229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return repr;
2231}
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2234{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002235 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002240 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode));
2242
2243 if (!s)
2244 return NULL;
2245 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2246 PyBytes_GET_SIZE(s));
2247 Py_DECREF(s);
2248 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249}
2250
2251/* --- Raw Unicode Escape Codec ------------------------------------------- */
2252
2253PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002254 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 const char *errors)
2256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002258 Py_ssize_t startinpos;
2259 Py_ssize_t endinpos;
2260 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 const char *end;
2264 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 PyObject *errorHandler = NULL;
2266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002267
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 /* Escaped strings will always be longer than the resulting
2269 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 length after conversion to the true value. (But decoding error
2271 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 end = s + size;
2279 while (s < end) {
2280 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002281 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002283 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 /* Non-escape characters are interpreted as Unicode ordinals */
2286 if (*s != '\\') {
2287 *p++ = (unsigned char)*s++;
2288 continue;
2289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
2292 /* \u-escapes are only interpreted iff the number of leading
2293 backslashes if odd */
2294 bs = s;
2295 for (;s < end;) {
2296 if (*s != '\\')
2297 break;
2298 *p++ = (unsigned char)*s++;
2299 }
2300 if (((s - bs) & 1) == 0 ||
2301 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002302 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 continue;
2304 }
2305 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002306 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 s++;
2308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002314 endinpos = s-starts;
2315 if (unicode_decode_call_errorhandler(
2316 errors, &errorHandler,
2317 "rawunicodeescape", "truncated \\uXXXX",
2318 starts, size, &startinpos, &endinpos, &exc, &s,
2319 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002321 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
2323 x = (x<<4) & ~0xF;
2324 if (c >= '0' && c <= '9')
2325 x += c - '0';
2326 else if (c >= 'a' && c <= 'f')
2327 x += 10 + c - 'a';
2328 else
2329 x += 10 + c - 'A';
2330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002331#ifndef Py_UNICODE_WIDE
2332 if (x > 0x10000) {
2333 if (unicode_decode_call_errorhandler(
2334 errors, &errorHandler,
2335 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2336 starts, size, &startinpos, &endinpos, &exc, &s,
2337 (PyObject **)&v, &outpos, &p))
2338 goto onError;
2339 }
2340#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 *p++ = x;
2342 nextByte:
2343 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002345 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 Py_XDECREF(errorHandler);
2348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 onError:
2352 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 return NULL;
2356}
2357
2358PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360{
2361 PyObject *repr;
2362 char *p;
2363 char *q;
2364
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002365#ifdef Py_UNICODE_WIDE
2366 repr = PyString_FromStringAndSize(NULL, 10 * size);
2367#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 if (repr == NULL)
2371 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002372 if (size == 0)
2373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 p = q = PyString_AS_STRING(repr);
2376 while (size-- > 0) {
2377 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002378#ifdef Py_UNICODE_WIDE
2379 /* Map 32-bit characters to '\Uxxxxxxxx' */
2380 if (ch >= 0x10000) {
2381 *p++ = '\\';
2382 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002383 *p++ = hexdigits[(ch >> 28) & 0xf];
2384 *p++ = hexdigits[(ch >> 24) & 0xf];
2385 *p++ = hexdigits[(ch >> 20) & 0xf];
2386 *p++ = hexdigits[(ch >> 16) & 0xf];
2387 *p++ = hexdigits[(ch >> 12) & 0xf];
2388 *p++ = hexdigits[(ch >> 8) & 0xf];
2389 *p++ = hexdigits[(ch >> 4) & 0xf];
2390 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002391 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002392 else
2393#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 /* Map 16-bit characters to '\uxxxx' */
2395 if (ch >= 256) {
2396 *p++ = '\\';
2397 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002398 *p++ = hexdigits[(ch >> 12) & 0xf];
2399 *p++ = hexdigits[(ch >> 8) & 0xf];
2400 *p++ = hexdigits[(ch >> 4) & 0xf];
2401 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 }
2403 /* Copy everything else as-is */
2404 else
2405 *p++ = (char) ch;
2406 }
2407 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002408 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 return repr;
2410}
2411
2412PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2413{
2414 if (!PyUnicode_Check(unicode)) {
2415 PyErr_BadArgument();
2416 return NULL;
2417 }
2418 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2419 PyUnicode_GET_SIZE(unicode));
2420}
2421
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002422/* --- Unicode Internal Codec ------------------------------------------- */
2423
2424PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002425 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002426 const char *errors)
2427{
2428 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t startinpos;
2430 Py_ssize_t endinpos;
2431 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
2434 const char *end;
2435 const char *reason;
2436 PyObject *errorHandler = NULL;
2437 PyObject *exc = NULL;
2438
Neal Norwitzd43069c2006-01-08 01:12:10 +00002439#ifdef Py_UNICODE_WIDE
2440 Py_UNICODE unimax = PyUnicode_GetMax();
2441#endif
2442
Thomas Wouters89f507f2006-12-13 04:49:30 +00002443 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002444 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2445 if (v == NULL)
2446 goto onError;
2447 if (PyUnicode_GetSize((PyObject *)v) == 0)
2448 return (PyObject *)v;
2449 p = PyUnicode_AS_UNICODE(v);
2450 end = s + size;
2451
2452 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002453 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002454 /* We have to sanity check the raw data, otherwise doom looms for
2455 some malformed UCS-4 data. */
2456 if (
2457 #ifdef Py_UNICODE_WIDE
2458 *p > unimax || *p < 0 ||
2459 #endif
2460 end-s < Py_UNICODE_SIZE
2461 )
2462 {
2463 startinpos = s - starts;
2464 if (end-s < Py_UNICODE_SIZE) {
2465 endinpos = end-starts;
2466 reason = "truncated input";
2467 }
2468 else {
2469 endinpos = s - starts + Py_UNICODE_SIZE;
2470 reason = "illegal code point (> 0x10FFFF)";
2471 }
2472 outpos = p - PyUnicode_AS_UNICODE(v);
2473 if (unicode_decode_call_errorhandler(
2474 errors, &errorHandler,
2475 "unicode_internal", reason,
2476 starts, size, &startinpos, &endinpos, &exc, &s,
2477 (PyObject **)&v, &outpos, &p)) {
2478 goto onError;
2479 }
2480 }
2481 else {
2482 p++;
2483 s += Py_UNICODE_SIZE;
2484 }
2485 }
2486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002488 goto onError;
2489 Py_XDECREF(errorHandler);
2490 Py_XDECREF(exc);
2491 return (PyObject *)v;
2492
2493 onError:
2494 Py_XDECREF(v);
2495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
2497 return NULL;
2498}
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500/* --- Latin-1 Codec ------------------------------------------------------ */
2501
2502PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002503 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 const char *errors)
2505{
2506 PyUnicodeObject *v;
2507 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002508
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002510 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002511 Py_UNICODE r = *(unsigned char*)s;
2512 return PyUnicode_FromUnicode(&r, 1);
2513 }
2514
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 v = _PyUnicode_New(size);
2516 if (v == NULL)
2517 goto onError;
2518 if (size == 0)
2519 return (PyObject *)v;
2520 p = PyUnicode_AS_UNICODE(v);
2521 while (size-- > 0)
2522 *p++ = (unsigned char)*s++;
2523 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 onError:
2526 Py_XDECREF(v);
2527 return NULL;
2528}
2529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530/* create or adjust a UnicodeEncodeError */
2531static void make_encode_exception(PyObject **exceptionObject,
2532 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002533 const Py_UNICODE *unicode, Py_ssize_t size,
2534 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 if (*exceptionObject == NULL) {
2538 *exceptionObject = PyUnicodeEncodeError_Create(
2539 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 }
2541 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2543 goto onError;
2544 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2545 goto onError;
2546 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2547 goto onError;
2548 return;
2549 onError:
2550 Py_DECREF(*exceptionObject);
2551 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 }
2553}
2554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555/* raises a UnicodeEncodeError */
2556static void raise_encode_exception(PyObject **exceptionObject,
2557 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002558 const Py_UNICODE *unicode, Py_ssize_t size,
2559 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 const char *reason)
2561{
2562 make_encode_exception(exceptionObject,
2563 encoding, unicode, size, startpos, endpos, reason);
2564 if (*exceptionObject != NULL)
2565 PyCodec_StrictErrors(*exceptionObject);
2566}
2567
2568/* error handling callback helper:
2569 build arguments, call the callback and check the arguments,
2570 put the result into newpos and return the replacement string, which
2571 has to be freed by the caller */
2572static PyObject *unicode_encode_call_errorhandler(const char *errors,
2573 PyObject **errorHandler,
2574 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002575 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2576 Py_ssize_t startpos, Py_ssize_t endpos,
2577 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580
2581 PyObject *restuple;
2582 PyObject *resunicode;
2583
2584 if (*errorHandler == NULL) {
2585 *errorHandler = PyCodec_LookupError(errors);
2586 if (*errorHandler == NULL)
2587 return NULL;
2588 }
2589
2590 make_encode_exception(exceptionObject,
2591 encoding, unicode, size, startpos, endpos, reason);
2592 if (*exceptionObject == NULL)
2593 return NULL;
2594
2595 restuple = PyObject_CallFunctionObjArgs(
2596 *errorHandler, *exceptionObject, NULL);
2597 if (restuple == NULL)
2598 return NULL;
2599 if (!PyTuple_Check(restuple)) {
2600 PyErr_Format(PyExc_TypeError, &argparse[4]);
2601 Py_DECREF(restuple);
2602 return NULL;
2603 }
2604 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2605 &resunicode, newpos)) {
2606 Py_DECREF(restuple);
2607 return NULL;
2608 }
2609 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002610 *newpos = size+*newpos;
2611 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002612 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002613 Py_DECREF(restuple);
2614 return NULL;
2615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 Py_INCREF(resunicode);
2617 Py_DECREF(restuple);
2618 return resunicode;
2619}
2620
2621static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 const char *errors,
2624 int limit)
2625{
2626 /* output object */
2627 PyObject *res;
2628 /* pointers to the beginning and end+1 of input */
2629 const Py_UNICODE *startp = p;
2630 const Py_UNICODE *endp = p + size;
2631 /* pointer to the beginning of the unencodable characters */
2632 /* const Py_UNICODE *badp = NULL; */
2633 /* pointer into the output */
2634 char *str;
2635 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002636 Py_ssize_t respos = 0;
2637 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002638 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2639 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 PyObject *errorHandler = NULL;
2641 PyObject *exc = NULL;
2642 /* the following variable is used for caching string comparisons
2643 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2644 int known_errorHandler = -1;
2645
2646 /* allocate enough for a simple encoding without
2647 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002648 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 if (res == NULL)
2650 goto onError;
2651 if (size == 0)
2652 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002653 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 ressize = size;
2655
2656 while (p<endp) {
2657 Py_UNICODE c = *p;
2658
2659 /* can we encode this? */
2660 if (c<limit) {
2661 /* no overflow check, because we know that the space is enough */
2662 *str++ = (char)c;
2663 ++p;
2664 }
2665 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002666 Py_ssize_t unicodepos = p-startp;
2667 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002669 Py_ssize_t repsize;
2670 Py_ssize_t newpos;
2671 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 Py_UNICODE *uni2;
2673 /* startpos for collecting unencodable chars */
2674 const Py_UNICODE *collstart = p;
2675 const Py_UNICODE *collend = p;
2676 /* find all unecodable characters */
2677 while ((collend < endp) && ((*collend)>=limit))
2678 ++collend;
2679 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2680 if (known_errorHandler==-1) {
2681 if ((errors==NULL) || (!strcmp(errors, "strict")))
2682 known_errorHandler = 1;
2683 else if (!strcmp(errors, "replace"))
2684 known_errorHandler = 2;
2685 else if (!strcmp(errors, "ignore"))
2686 known_errorHandler = 3;
2687 else if (!strcmp(errors, "xmlcharrefreplace"))
2688 known_errorHandler = 4;
2689 else
2690 known_errorHandler = 0;
2691 }
2692 switch (known_errorHandler) {
2693 case 1: /* strict */
2694 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2695 goto onError;
2696 case 2: /* replace */
2697 while (collstart++<collend)
2698 *str++ = '?'; /* fall through */
2699 case 3: /* ignore */
2700 p = collend;
2701 break;
2702 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002703 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 /* determine replacement size (temporarily (mis)uses p) */
2705 for (p = collstart, repsize = 0; p < collend; ++p) {
2706 if (*p<10)
2707 repsize += 2+1+1;
2708 else if (*p<100)
2709 repsize += 2+2+1;
2710 else if (*p<1000)
2711 repsize += 2+3+1;
2712 else if (*p<10000)
2713 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002714#ifndef Py_UNICODE_WIDE
2715 else
2716 repsize += 2+5+1;
2717#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 else if (*p<100000)
2719 repsize += 2+5+1;
2720 else if (*p<1000000)
2721 repsize += 2+6+1;
2722 else
2723 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002724#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 }
2726 requiredsize = respos+repsize+(endp-collend);
2727 if (requiredsize > ressize) {
2728 if (requiredsize<2*ressize)
2729 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002730 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002732 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 ressize = requiredsize;
2734 }
2735 /* generate replacement (temporarily (mis)uses p) */
2736 for (p = collstart; p < collend; ++p) {
2737 str += sprintf(str, "&#%d;", (int)*p);
2738 }
2739 p = collend;
2740 break;
2741 default:
2742 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2743 encoding, reason, startp, size, &exc,
2744 collstart-startp, collend-startp, &newpos);
2745 if (repunicode == NULL)
2746 goto onError;
2747 /* need more space? (at least enough for what we
2748 have+the replacement+the rest of the string, so
2749 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002750 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 repsize = PyUnicode_GET_SIZE(repunicode);
2752 requiredsize = respos+repsize+(endp-collend);
2753 if (requiredsize > ressize) {
2754 if (requiredsize<2*ressize)
2755 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002756 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 Py_DECREF(repunicode);
2758 goto onError;
2759 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002760 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 ressize = requiredsize;
2762 }
2763 /* check if there is anything unencodable in the replacement
2764 and copy it to the output */
2765 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2766 c = *uni2;
2767 if (c >= limit) {
2768 raise_encode_exception(&exc, encoding, startp, size,
2769 unicodepos, unicodepos+1, reason);
2770 Py_DECREF(repunicode);
2771 goto onError;
2772 }
2773 *str = (char)c;
2774 }
2775 p = startp + newpos;
2776 Py_DECREF(repunicode);
2777 }
2778 }
2779 }
2780 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002781 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 if (respos<ressize)
2783 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002784 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 Py_XDECREF(errorHandler);
2786 Py_XDECREF(exc);
2787 return res;
2788
2789 onError:
2790 Py_XDECREF(res);
2791 Py_XDECREF(errorHandler);
2792 Py_XDECREF(exc);
2793 return NULL;
2794}
2795
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 const char *errors)
2799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
2803PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2804{
2805 if (!PyUnicode_Check(unicode)) {
2806 PyErr_BadArgument();
2807 return NULL;
2808 }
2809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2810 PyUnicode_GET_SIZE(unicode),
2811 NULL);
2812}
2813
2814/* --- 7-bit ASCII Codec -------------------------------------------------- */
2815
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 const char *errors)
2819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 Py_ssize_t startinpos;
2824 Py_ssize_t endinpos;
2825 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 const char *e;
2827 PyObject *errorHandler = NULL;
2828 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002831 if (size == 1 && *(unsigned char*)s < 128) {
2832 Py_UNICODE r = *(unsigned char*)s;
2833 return PyUnicode_FromUnicode(&r, 1);
2834 }
Tim Petersced69f82003-09-16 20:30:58 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 v = _PyUnicode_New(size);
2837 if (v == NULL)
2838 goto onError;
2839 if (size == 0)
2840 return (PyObject *)v;
2841 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 e = s + size;
2843 while (s < e) {
2844 register unsigned char c = (unsigned char)*s;
2845 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 ++s;
2848 }
2849 else {
2850 startinpos = s-starts;
2851 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 if (unicode_decode_call_errorhandler(
2854 errors, &errorHandler,
2855 "ascii", "ordinal not in range(128)",
2856 starts, size, &startinpos, &endinpos, &exc, &s,
2857 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002861 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002863 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 Py_XDECREF(errorHandler);
2865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002867
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 onError:
2869 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 Py_XDECREF(errorHandler);
2871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 return NULL;
2873}
2874
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002876 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 const char *errors)
2878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880}
2881
2882PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2883{
2884 if (!PyUnicode_Check(unicode)) {
2885 PyErr_BadArgument();
2886 return NULL;
2887 }
2888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2889 PyUnicode_GET_SIZE(unicode),
2890 NULL);
2891}
2892
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002893#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002894
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002895/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002897#if SIZEOF_INT < SIZEOF_SSIZE_T
2898#define NEED_RETRY
2899#endif
2900
2901/* XXX This code is limited to "true" double-byte encodings, as
2902 a) it assumes an incomplete character consists of a single byte, and
2903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2904 encodings, see IsDBCSLeadByteEx documentation. */
2905
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must be accepted by IsDBCSLeadByte().  In addition, the
   character found by CharPrev() must either be the same position (nothing
   precedes it), not start with a lead byte itself, or be exactly two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2916
2917/*
2918 * Decode MBCS string into unicode object. If 'final' is set, converts
2919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2920 */
2921static int decode_mbcs(PyUnicodeObject **v,
2922 const char *s, /* MBCS string */
2923 int size, /* sizeof MBCS string */
2924 int final)
2925{
2926 Py_UNICODE *p;
2927 Py_ssize_t n = 0;
2928 int usize = 0;
2929
2930 assert(size >= 0);
2931
2932 /* Skip trailing lead-byte unless 'final' is set */
2933 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2934 --size;
2935
2936 /* First get the size of the result */
2937 if (size > 0) {
2938 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2939 if (usize == 0) {
2940 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2941 return -1;
2942 }
2943 }
2944
2945 if (*v == NULL) {
2946 /* Create unicode object */
2947 *v = _PyUnicode_New(usize);
2948 if (*v == NULL)
2949 return -1;
2950 }
2951 else {
2952 /* Extend unicode object */
2953 n = PyUnicode_GET_SIZE(*v);
2954 if (_PyUnicode_Resize(v, n + usize) < 0)
2955 return -1;
2956 }
2957
2958 /* Do the conversion */
2959 if (size > 0) {
2960 p = PyUnicode_AS_UNICODE(*v) + n;
2961 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2962 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2963 return -1;
2964 }
2965 }
2966
2967 return size;
2968}
2969
2970PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2971 Py_ssize_t size,
2972 const char *errors,
2973 Py_ssize_t *consumed)
2974{
2975 PyUnicodeObject *v = NULL;
2976 int done;
2977
2978 if (consumed)
2979 *consumed = 0;
2980
2981#ifdef NEED_RETRY
2982 retry:
2983 if (size > INT_MAX)
2984 done = decode_mbcs(&v, s, INT_MAX, 0);
2985 else
2986#endif
2987 done = decode_mbcs(&v, s, (int)size, !consumed);
2988
2989 if (done < 0) {
2990 Py_XDECREF(v);
2991 return NULL;
2992 }
2993
2994 if (consumed)
2995 *consumed += done;
2996
2997#ifdef NEED_RETRY
2998 if (size > INT_MAX) {
2999 s += done;
3000 size -= done;
3001 goto retry;
3002 }
3003#endif
3004
3005 return (PyObject *)v;
3006}
3007
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003008PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003009 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003010 const char *errors)
3011{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003012 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3013}
3014
3015/*
3016 * Convert unicode into string object (MBCS).
3017 * Returns 0 if succeed, -1 otherwise.
3018 */
3019static int encode_mbcs(PyObject **repr,
3020 const Py_UNICODE *p, /* unicode */
3021 int size) /* size of unicode */
3022{
3023 int mbcssize = 0;
3024 Py_ssize_t n = 0;
3025
3026 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003027
3028 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003029 if (size > 0) {
3030 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3031 if (mbcssize == 0) {
3032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3033 return -1;
3034 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003035 }
3036
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037 if (*repr == NULL) {
3038 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003039 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003040 if (*repr == NULL)
3041 return -1;
3042 }
3043 else {
3044 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003045 n = PyBytes_Size(*repr);
3046 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003047 return -1;
3048 }
3049
3050 /* Do the conversion */
3051 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003052 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003053 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3055 return -1;
3056 }
3057 }
3058
3059 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003060}
3061
3062PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003063 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003064 const char *errors)
3065{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003066 PyObject *repr = NULL;
3067 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003068
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003069#ifdef NEED_RETRY
3070 retry:
3071 if (size > INT_MAX)
3072 ret = encode_mbcs(&repr, p, INT_MAX);
3073 else
3074#endif
3075 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003077 if (ret < 0) {
3078 Py_XDECREF(repr);
3079 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003080 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003081
3082#ifdef NEED_RETRY
3083 if (size > INT_MAX) {
3084 p += INT_MAX;
3085 size -= INT_MAX;
3086 goto retry;
3087 }
3088#endif
3089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003090 return repr;
3091}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003092
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003093PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3094{
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 return NULL;
3098 }
3099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3100 PyUnicode_GET_SIZE(unicode),
3101 NULL);
3102}
3103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003104#undef NEED_RETRY
3105
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003106#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003107
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108/* --- Character Mapping Codec -------------------------------------------- */
3109
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 PyObject *mapping,
3113 const char *errors)
3114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003116 Py_ssize_t startinpos;
3117 Py_ssize_t endinpos;
3118 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 PyUnicodeObject *v;
3121 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 PyObject *errorHandler = NULL;
3124 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003125 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003127
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 /* Default to Latin-1 */
3129 if (mapping == NULL)
3130 return PyUnicode_DecodeLatin1(s, size, errors);
3131
3132 v = _PyUnicode_New(size);
3133 if (v == NULL)
3134 goto onError;
3135 if (size == 0)
3136 return (PyObject *)v;
3137 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003139 if (PyUnicode_CheckExact(mapping)) {
3140 mapstring = PyUnicode_AS_UNICODE(mapping);
3141 maplen = PyUnicode_GET_SIZE(mapping);
3142 while (s < e) {
3143 unsigned char ch = *s;
3144 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003146 if (ch < maplen)
3147 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003149 if (x == 0xfffe) {
3150 /* undefined mapping */
3151 outpos = p-PyUnicode_AS_UNICODE(v);
3152 startinpos = s-starts;
3153 endinpos = startinpos+1;
3154 if (unicode_decode_call_errorhandler(
3155 errors, &errorHandler,
3156 "charmap", "character maps to <undefined>",
3157 starts, size, &startinpos, &endinpos, &exc, &s,
3158 (PyObject **)&v, &outpos, &p)) {
3159 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003160 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003161 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003162 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003163 *p++ = x;
3164 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003166 }
3167 else {
3168 while (s < e) {
3169 unsigned char ch = *s;
3170 PyObject *w, *x;
3171
3172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3173 w = PyInt_FromLong((long)ch);
3174 if (w == NULL)
3175 goto onError;
3176 x = PyObject_GetItem(mapping, w);
3177 Py_DECREF(w);
3178 if (x == NULL) {
3179 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3180 /* No mapping found means: mapping is undefined. */
3181 PyErr_Clear();
3182 x = Py_None;
3183 Py_INCREF(x);
3184 } else
3185 goto onError;
3186 }
3187
3188 /* Apply mapping */
3189 if (PyInt_Check(x)) {
3190 long value = PyInt_AS_LONG(x);
3191 if (value < 0 || value > 65535) {
3192 PyErr_SetString(PyExc_TypeError,
3193 "character mapping must be in range(65536)");
3194 Py_DECREF(x);
3195 goto onError;
3196 }
3197 *p++ = (Py_UNICODE)value;
3198 }
3199 else if (x == Py_None) {
3200 /* undefined mapping */
3201 outpos = p-PyUnicode_AS_UNICODE(v);
3202 startinpos = s-starts;
3203 endinpos = startinpos+1;
3204 if (unicode_decode_call_errorhandler(
3205 errors, &errorHandler,
3206 "charmap", "character maps to <undefined>",
3207 starts, size, &startinpos, &endinpos, &exc, &s,
3208 (PyObject **)&v, &outpos, &p)) {
3209 Py_DECREF(x);
3210 goto onError;
3211 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003212 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003213 continue;
3214 }
3215 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003216 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003217
3218 if (targetsize == 1)
3219 /* 1-1 mapping */
3220 *p++ = *PyUnicode_AS_UNICODE(x);
3221
3222 else if (targetsize > 1) {
3223 /* 1-n mapping */
3224 if (targetsize > extrachars) {
3225 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003226 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3227 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003228 (targetsize << 2);
3229 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003230 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003231 if (_PyUnicode_Resize(&v,
3232 PyUnicode_GET_SIZE(v) + needed) < 0) {
3233 Py_DECREF(x);
3234 goto onError;
3235 }
3236 p = PyUnicode_AS_UNICODE(v) + oldpos;
3237 }
3238 Py_UNICODE_COPY(p,
3239 PyUnicode_AS_UNICODE(x),
3240 targetsize);
3241 p += targetsize;
3242 extrachars -= targetsize;
3243 }
3244 /* 1-0 mapping: skip the character */
3245 }
3246 else {
3247 /* wrong return value */
3248 PyErr_SetString(PyExc_TypeError,
3249 "character mapping must return integer, None or unicode");
3250 Py_DECREF(x);
3251 goto onError;
3252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003254 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
3257 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003258 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 Py_XDECREF(v);
3268 return NULL;
3269}
3270
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003271/* Charmap encoding: the lookup table */
3272
3273struct encoding_map{
3274 PyObject_HEAD
3275 unsigned char level1[32];
3276 int count2, count3;
3277 unsigned char level23[1];
3278};
3279
3280static PyObject*
3281encoding_map_size(PyObject *obj, PyObject* args)
3282{
3283 struct encoding_map *map = (struct encoding_map*)obj;
3284 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3285 128*map->count3);
3286}
3287
3288static PyMethodDef encoding_map_methods[] = {
3289 {"size", encoding_map_size, METH_NOARGS,
3290 PyDoc_STR("Return the size (in bytes) of this object") },
3291 { 0 }
3292};
3293
3294static void
3295encoding_map_dealloc(PyObject* o)
3296{
3297 PyObject_FREE(o);
3298}
3299
3300static PyTypeObject EncodingMapType = {
3301 PyObject_HEAD_INIT(NULL)
3302 0, /*ob_size*/
3303 "EncodingMap", /*tp_name*/
3304 sizeof(struct encoding_map), /*tp_basicsize*/
3305 0, /*tp_itemsize*/
3306 /* methods */
3307 encoding_map_dealloc, /*tp_dealloc*/
3308 0, /*tp_print*/
3309 0, /*tp_getattr*/
3310 0, /*tp_setattr*/
3311 0, /*tp_compare*/
3312 0, /*tp_repr*/
3313 0, /*tp_as_number*/
3314 0, /*tp_as_sequence*/
3315 0, /*tp_as_mapping*/
3316 0, /*tp_hash*/
3317 0, /*tp_call*/
3318 0, /*tp_str*/
3319 0, /*tp_getattro*/
3320 0, /*tp_setattro*/
3321 0, /*tp_as_buffer*/
3322 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3323 0, /*tp_doc*/
3324 0, /*tp_traverse*/
3325 0, /*tp_clear*/
3326 0, /*tp_richcompare*/
3327 0, /*tp_weaklistoffset*/
3328 0, /*tp_iter*/
3329 0, /*tp_iternext*/
3330 encoding_map_methods, /*tp_methods*/
3331 0, /*tp_members*/
3332 0, /*tp_getset*/
3333 0, /*tp_base*/
3334 0, /*tp_dict*/
3335 0, /*tp_descr_get*/
3336 0, /*tp_descr_set*/
3337 0, /*tp_dictoffset*/
3338 0, /*tp_init*/
3339 0, /*tp_alloc*/
3340 0, /*tp_new*/
3341 0, /*tp_free*/
3342 0, /*tp_is_gc*/
3343};
3344
3345PyObject*
3346PyUnicode_BuildEncodingMap(PyObject* string)
3347{
3348 Py_UNICODE *decode;
3349 PyObject *result;
3350 struct encoding_map *mresult;
3351 int i;
3352 int need_dict = 0;
3353 unsigned char level1[32];
3354 unsigned char level2[512];
3355 unsigned char *mlevel1, *mlevel2, *mlevel3;
3356 int count2 = 0, count3 = 0;
3357
3358 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 decode = PyUnicode_AS_UNICODE(string);
3363 memset(level1, 0xFF, sizeof level1);
3364 memset(level2, 0xFF, sizeof level2);
3365
3366 /* If there isn't a one-to-one mapping of NULL to \0,
3367 or if there are non-BMP characters, we need to use
3368 a mapping dictionary. */
3369 if (decode[0] != 0)
3370 need_dict = 1;
3371 for (i = 1; i < 256; i++) {
3372 int l1, l2;
3373 if (decode[i] == 0
3374 #ifdef Py_UNICODE_WIDE
3375 || decode[i] > 0xFFFF
3376 #endif
3377 ) {
3378 need_dict = 1;
3379 break;
3380 }
3381 if (decode[i] == 0xFFFE)
3382 /* unmapped character */
3383 continue;
3384 l1 = decode[i] >> 11;
3385 l2 = decode[i] >> 7;
3386 if (level1[l1] == 0xFF)
3387 level1[l1] = count2++;
3388 if (level2[l2] == 0xFF)
3389 level2[l2] = count3++;
3390 }
3391
3392 if (count2 >= 0xFF || count3 >= 0xFF)
3393 need_dict = 1;
3394
3395 if (need_dict) {
3396 PyObject *result = PyDict_New();
3397 PyObject *key, *value;
3398 if (!result)
3399 return NULL;
3400 for (i = 0; i < 256; i++) {
3401 key = value = NULL;
3402 key = PyInt_FromLong(decode[i]);
3403 value = PyInt_FromLong(i);
3404 if (!key || !value)
3405 goto failed1;
3406 if (PyDict_SetItem(result, key, value) == -1)
3407 goto failed1;
3408 Py_DECREF(key);
3409 Py_DECREF(value);
3410 }
3411 return result;
3412 failed1:
3413 Py_XDECREF(key);
3414 Py_XDECREF(value);
3415 Py_DECREF(result);
3416 return NULL;
3417 }
3418
3419 /* Create a three-level trie */
3420 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3421 16*count2 + 128*count3 - 1);
3422 if (!result)
3423 return PyErr_NoMemory();
3424 PyObject_Init(result, &EncodingMapType);
3425 mresult = (struct encoding_map*)result;
3426 mresult->count2 = count2;
3427 mresult->count3 = count3;
3428 mlevel1 = mresult->level1;
3429 mlevel2 = mresult->level23;
3430 mlevel3 = mresult->level23 + 16*count2;
3431 memcpy(mlevel1, level1, 32);
3432 memset(mlevel2, 0xFF, 16*count2);
3433 memset(mlevel3, 0, 128*count3);
3434 count3 = 0;
3435 for (i = 1; i < 256; i++) {
3436 int o1, o2, o3, i2, i3;
3437 if (decode[i] == 0xFFFE)
3438 /* unmapped character */
3439 continue;
3440 o1 = decode[i]>>11;
3441 o2 = (decode[i]>>7) & 0xF;
3442 i2 = 16*mlevel1[o1] + o2;
3443 if (mlevel2[i2] == 0xFF)
3444 mlevel2[i2] = count3++;
3445 o3 = decode[i] & 0x7F;
3446 i3 = 128*mlevel2[i2] + o3;
3447 mlevel3[i3] = i;
3448 }
3449 return result;
3450}
3451
3452static int
3453encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3454{
3455 struct encoding_map *map = (struct encoding_map*)mapping;
3456 int l1 = c>>11;
3457 int l2 = (c>>7) & 0xF;
3458 int l3 = c & 0x7F;
3459 int i;
3460
3461#ifdef Py_UNICODE_WIDE
3462 if (c > 0xFFFF) {
3463 return -1;
3464 }
3465#endif
3466 if (c == 0)
3467 return 0;
3468 /* level 1*/
3469 i = map->level1[l1];
3470 if (i == 0xFF) {
3471 return -1;
3472 }
3473 /* level 2*/
3474 i = map->level23[16*i+l2];
3475 if (i == 0xFF) {
3476 return -1;
3477 }
3478 /* level 3 */
3479 i = map->level23[16*map->count2 + 128*i + l3];
3480 if (i == 0) {
3481 return -1;
3482 }
3483 return i;
3484}
3485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486/* Lookup the character ch in the mapping. If the character
3487 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003488 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 PyObject *w = PyInt_FromLong((long)c);
3492 PyObject *x;
3493
3494 if (w == NULL)
3495 return NULL;
3496 x = PyObject_GetItem(mapping, w);
3497 Py_DECREF(w);
3498 if (x == NULL) {
3499 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3500 /* No mapping found means: mapping is undefined. */
3501 PyErr_Clear();
3502 x = Py_None;
3503 Py_INCREF(x);
3504 return x;
3505 } else
3506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003508 else if (x == Py_None)
3509 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 else if (PyInt_Check(x)) {
3511 long value = PyInt_AS_LONG(x);
3512 if (value < 0 || value > 255) {
3513 PyErr_SetString(PyExc_TypeError,
3514 "character mapping must be in range(256)");
3515 Py_DECREF(x);
3516 return NULL;
3517 }
3518 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 else if (PyString_Check(x))
3521 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003524 PyErr_Format(PyExc_TypeError,
3525 "character mapping must return integer, None or str8, not %.400s",
3526 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 Py_DECREF(x);
3528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 }
3530}
3531
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532static int
3533charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3534{
3535 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3536 /* exponentially overallocate to minimize reallocations */
3537 if (requiredsize < 2*outsize)
3538 requiredsize = 2*outsize;
3539 if (_PyString_Resize(outobj, requiredsize)) {
3540 return 0;
3541 }
3542 return 1;
3543}
3544
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548/* lookup the character, put the result in the output string and adjust
3549 various state variables. Reallocate the output string if not enough
3550 space is available. Return a new reference to the object that
3551 was put in the output buffer, or Py_None, if the mapping was undefined
3552 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003553 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003555charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003558 PyObject *rep;
3559 char *outstart;
3560 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003562 if (mapping->ob_type == &EncodingMapType) {
3563 int res = encoding_map_lookup(c, mapping);
3564 Py_ssize_t requiredsize = *outpos+1;
3565 if (res == -1)
3566 return enc_FAILED;
3567 if (outsize<requiredsize)
3568 if (!charmapencode_resize(outobj, outpos, requiredsize))
3569 return enc_EXCEPTION;
3570 outstart = PyString_AS_STRING(*outobj);
3571 outstart[(*outpos)++] = (char)res;
3572 return enc_SUCCESS;
3573 }
3574
3575 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003577 return enc_EXCEPTION;
3578 else if (rep==Py_None) {
3579 Py_DECREF(rep);
3580 return enc_FAILED;
3581 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003583 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 if (outsize<requiredsize)
3585 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003587 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003589 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3591 }
3592 else {
3593 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003594 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3595 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003596 if (outsize<requiredsize)
3597 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003599 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003601 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 memcpy(outstart + *outpos, repchars, repsize);
3603 *outpos += repsize;
3604 }
3605 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003606 Py_DECREF(rep);
3607 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608}
3609
3610/* handle an error in PyUnicode_EncodeCharmap
3611 Return 0 on success, -1 on error */
3612static
3613int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003616 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003617 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618{
3619 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003620 Py_ssize_t repsize;
3621 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_UNICODE *uni2;
3623 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003624 Py_ssize_t collstartpos = *inpos;
3625 Py_ssize_t collendpos = *inpos+1;
3626 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 char *encoding = "charmap";
3628 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003629 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 /* find all unencodable characters */
3632 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003633 PyObject *rep;
3634 if (mapping->ob_type == &EncodingMapType) {
3635 int res = encoding_map_lookup(p[collendpos], mapping);
3636 if (res != -1)
3637 break;
3638 ++collendpos;
3639 continue;
3640 }
3641
3642 rep = charmapencode_lookup(p[collendpos], mapping);
3643 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003645 else if (rep!=Py_None) {
3646 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 break;
3648 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003649 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 ++collendpos;
3651 }
3652 /* cache callback name lookup
3653 * (if not done yet, i.e. it's the first error) */
3654 if (*known_errorHandler==-1) {
3655 if ((errors==NULL) || (!strcmp(errors, "strict")))
3656 *known_errorHandler = 1;
3657 else if (!strcmp(errors, "replace"))
3658 *known_errorHandler = 2;
3659 else if (!strcmp(errors, "ignore"))
3660 *known_errorHandler = 3;
3661 else if (!strcmp(errors, "xmlcharrefreplace"))
3662 *known_errorHandler = 4;
3663 else
3664 *known_errorHandler = 0;
3665 }
3666 switch (*known_errorHandler) {
3667 case 1: /* strict */
3668 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3669 return -1;
3670 case 2: /* replace */
3671 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3672 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003673 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 return -1;
3675 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003676 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3678 return -1;
3679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 }
3681 /* fall through */
3682 case 3: /* ignore */
3683 *inpos = collendpos;
3684 break;
3685 case 4: /* xmlcharrefreplace */
3686 /* generate replacement (temporarily (mis)uses p) */
3687 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3688 char buffer[2+29+1+1];
3689 char *cp;
3690 sprintf(buffer, "&#%d;", (int)p[collpos]);
3691 for (cp = buffer; *cp; ++cp) {
3692 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003693 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3697 return -1;
3698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 }
3700 }
3701 *inpos = collendpos;
3702 break;
3703 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003704 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 encoding, reason, p, size, exceptionObject,
3706 collstartpos, collendpos, &newpos);
3707 if (repunicode == NULL)
3708 return -1;
3709 /* generate replacement */
3710 repsize = PyUnicode_GET_SIZE(repunicode);
3711 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3712 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003713 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 return -1;
3715 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003716 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3719 return -1;
3720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 }
3722 *inpos = newpos;
3723 Py_DECREF(repunicode);
3724 }
3725 return 0;
3726}
3727
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003729 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 PyObject *mapping,
3731 const char *errors)
3732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 /* output object */
3734 PyObject *res = NULL;
3735 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003736 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 PyObject *errorHandler = NULL;
3740 PyObject *exc = NULL;
3741 /* the following variable is used for caching string comparisons
3742 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3743 * 3=ignore, 4=xmlcharrefreplace */
3744 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745
3746 /* Default to Latin-1 */
3747 if (mapping == NULL)
3748 return PyUnicode_EncodeLatin1(p, size, errors);
3749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 /* allocate enough for a simple encoding without
3751 replacements, if we need more, we'll resize */
3752 res = PyString_FromStringAndSize(NULL, size);
3753 if (res == NULL)
3754 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003755 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 while (inpos<size) {
3759 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003760 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3761 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003763 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 if (charmap_encoding_error(p, size, &inpos, mapping,
3765 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003766 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003767 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003768 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 else
3772 /* done with this character => adjust input position */
3773 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* Resize if we allocated to much */
3777 if (respos<PyString_GET_SIZE(res)) {
3778 if (_PyString_Resize(&res, respos))
3779 goto onError;
3780 }
3781 Py_XDECREF(exc);
3782 Py_XDECREF(errorHandler);
3783 return res;
3784
3785 onError:
3786 Py_XDECREF(res);
3787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return NULL;
3790}
3791
3792PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3793 PyObject *mapping)
3794{
3795 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3796 PyErr_BadArgument();
3797 return NULL;
3798 }
3799 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3800 PyUnicode_GET_SIZE(unicode),
3801 mapping,
3802 NULL);
3803}
3804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805/* create or adjust a UnicodeTranslateError */
3806static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 const Py_UNICODE *unicode, Py_ssize_t size,
3808 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 if (*exceptionObject == NULL) {
3812 *exceptionObject = PyUnicodeTranslateError_Create(
3813 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 }
3815 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3817 goto onError;
3818 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3819 goto onError;
3820 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3821 goto onError;
3822 return;
3823 onError:
3824 Py_DECREF(*exceptionObject);
3825 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
3827}
3828
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829/* raises a UnicodeTranslateError */
3830static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003831 const Py_UNICODE *unicode, Py_ssize_t size,
3832 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 const char *reason)
3834{
3835 make_translate_exception(exceptionObject,
3836 unicode, size, startpos, endpos, reason);
3837 if (*exceptionObject != NULL)
3838 PyCodec_StrictErrors(*exceptionObject);
3839}
3840
3841/* error handling callback helper:
3842 build arguments, call the callback and check the arguments,
3843 put the result into newpos and return the replacement string, which
3844 has to be freed by the caller */
3845static PyObject *unicode_translate_call_errorhandler(const char *errors,
3846 PyObject **errorHandler,
3847 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3849 Py_ssize_t startpos, Py_ssize_t endpos,
3850 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003852 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003854 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 PyObject *restuple;
3856 PyObject *resunicode;
3857
3858 if (*errorHandler == NULL) {
3859 *errorHandler = PyCodec_LookupError(errors);
3860 if (*errorHandler == NULL)
3861 return NULL;
3862 }
3863
3864 make_translate_exception(exceptionObject,
3865 unicode, size, startpos, endpos, reason);
3866 if (*exceptionObject == NULL)
3867 return NULL;
3868
3869 restuple = PyObject_CallFunctionObjArgs(
3870 *errorHandler, *exceptionObject, NULL);
3871 if (restuple == NULL)
3872 return NULL;
3873 if (!PyTuple_Check(restuple)) {
3874 PyErr_Format(PyExc_TypeError, &argparse[4]);
3875 Py_DECREF(restuple);
3876 return NULL;
3877 }
3878 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003879 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 Py_DECREF(restuple);
3881 return NULL;
3882 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003883 if (i_newpos<0)
3884 *newpos = size+i_newpos;
3885 else
3886 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003887 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003888 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003889 Py_DECREF(restuple);
3890 return NULL;
3891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 Py_INCREF(resunicode);
3893 Py_DECREF(restuple);
3894 return resunicode;
3895}
3896
3897/* Lookup the character ch in the mapping and put the result in result,
3898 which must be decrefed by the caller.
3899 Return 0 on success, -1 on error */
3900static
3901int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3902{
3903 PyObject *w = PyInt_FromLong((long)c);
3904 PyObject *x;
3905
3906 if (w == NULL)
3907 return -1;
3908 x = PyObject_GetItem(mapping, w);
3909 Py_DECREF(w);
3910 if (x == NULL) {
3911 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3912 /* No mapping found means: use 1:1 mapping. */
3913 PyErr_Clear();
3914 *result = NULL;
3915 return 0;
3916 } else
3917 return -1;
3918 }
3919 else if (x == Py_None) {
3920 *result = x;
3921 return 0;
3922 }
3923 else if (PyInt_Check(x)) {
3924 long value = PyInt_AS_LONG(x);
3925 long max = PyUnicode_GetMax();
3926 if (value < 0 || value > max) {
3927 PyErr_Format(PyExc_TypeError,
3928 "character mapping must be in range(0x%lx)", max+1);
3929 Py_DECREF(x);
3930 return -1;
3931 }
3932 *result = x;
3933 return 0;
3934 }
3935 else if (PyUnicode_Check(x)) {
3936 *result = x;
3937 return 0;
3938 }
3939 else {
3940 /* wrong return value */
3941 PyErr_SetString(PyExc_TypeError,
3942 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003943 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 return -1;
3945 }
3946}
3947/* ensure that *outobj is at least requiredsize characters long,
3948if not reallocate and adjust various state variables.
3949Return 0 on success, -1 on error */
3950static
Walter Dörwald4894c302003-10-24 14:25:28 +00003951int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003955 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003959 if (requiredsize < 2 * oldsize)
3960 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003961 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 return -1;
3963 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 }
3965 return 0;
3966}
3967/* lookup the character, put the result in the output string and adjust
3968 various state variables. Return a new reference to the object that
3969 was put in the output buffer in *result, or Py_None, if the mapping was
3970 undefined (in which case no character was written).
3971 The called must decref result.
3972 Return 0 on success, -1 on error. */
3973static
Walter Dörwald4894c302003-10-24 14:25:28 +00003974int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003975 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003976 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977{
Walter Dörwald4894c302003-10-24 14:25:28 +00003978 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 return -1;
3980 if (*res==NULL) {
3981 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003982 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983 }
3984 else if (*res==Py_None)
3985 ;
3986 else if (PyInt_Check(*res)) {
3987 /* no overflow check, because we know that the space is enough */
3988 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3989 }
3990 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003991 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 if (repsize==1) {
3993 /* no overflow check, because we know that the space is enough */
3994 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3995 }
3996 else if (repsize!=0) {
3997 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003998 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003999 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004000 repsize - 1;
4001 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 return -1;
4003 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4004 *outp += repsize;
4005 }
4006 }
4007 else
4008 return -1;
4009 return 0;
4010}
4011
/* Translate the size characters starting at p through mapping and
   return the result as a new Unicode object.

   Each character is looked up in mapping by its ordinal: a missing
   entry copies the character unchanged, an integer or unicode value is
   written in its place, and None marks the character as untranslatable.
   Runs of untranslatable characters are handled according to errors:
   NULL or "strict" raises a UnicodeTranslateError, "replace" writes one
   '?' per character, "ignore" drops them, "xmlcharrefreplace" writes
   "&#N;" references, and any other name is resolved and called through
   unicode_translate_call_errorhandler().

   Returns NULL on failure. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is released here; below it is only compared by identity
           against Py_None and never dereferenced */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while
               the mapping keeps returning None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    /* skip the whole run of untranslatable characters */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* room for what is written so far, this
                           reference, and the input after the run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    /* any other handler name: call the registered callback */
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    /* continue at the position the callback asked for */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4153
4154PyObject *PyUnicode_Translate(PyObject *str,
4155 PyObject *mapping,
4156 const char *errors)
4157{
4158 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004159
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 str = PyUnicode_FromObject(str);
4161 if (str == NULL)
4162 goto onError;
4163 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4164 PyUnicode_GET_SIZE(str),
4165 mapping,
4166 errors);
4167 Py_DECREF(str);
4168 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 onError:
4171 Py_XDECREF(str);
4172 return NULL;
4173}
Tim Petersced69f82003-09-16 20:30:58 +00004174
Guido van Rossum9e896b32000-04-05 20:11:21 +00004175/* --- Decimal Encoder ---------------------------------------------------- */
4176
4177int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004179 char *output,
4180 const char *errors)
4181{
4182 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 PyObject *errorHandler = NULL;
4184 PyObject *exc = NULL;
4185 const char *encoding = "decimal";
4186 const char *reason = "invalid decimal Unicode string";
4187 /* the following variable is used for caching string comparisons
4188 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4189 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004190
4191 if (output == NULL) {
4192 PyErr_BadArgument();
4193 return -1;
4194 }
4195
4196 p = s;
4197 end = s + length;
4198 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004200 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004202 Py_ssize_t repsize;
4203 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_UNICODE *uni2;
4205 Py_UNICODE *collstart;
4206 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Guido van Rossum9e896b32000-04-05 20:11:21 +00004208 if (Py_UNICODE_ISSPACE(ch)) {
4209 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004211 continue;
4212 }
4213 decimal = Py_UNICODE_TODECIMAL(ch);
4214 if (decimal >= 0) {
4215 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004217 continue;
4218 }
Guido van Rossumba477042000-04-06 18:18:10 +00004219 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004220 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004222 continue;
4223 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 /* All other characters are considered unencodable */
4225 collstart = p;
4226 collend = p+1;
4227 while (collend < end) {
4228 if ((0 < *collend && *collend < 256) ||
4229 !Py_UNICODE_ISSPACE(*collend) ||
4230 Py_UNICODE_TODECIMAL(*collend))
4231 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 /* cache callback name lookup
4234 * (if not done yet, i.e. it's the first error) */
4235 if (known_errorHandler==-1) {
4236 if ((errors==NULL) || (!strcmp(errors, "strict")))
4237 known_errorHandler = 1;
4238 else if (!strcmp(errors, "replace"))
4239 known_errorHandler = 2;
4240 else if (!strcmp(errors, "ignore"))
4241 known_errorHandler = 3;
4242 else if (!strcmp(errors, "xmlcharrefreplace"))
4243 known_errorHandler = 4;
4244 else
4245 known_errorHandler = 0;
4246 }
4247 switch (known_errorHandler) {
4248 case 1: /* strict */
4249 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4250 goto onError;
4251 case 2: /* replace */
4252 for (p = collstart; p < collend; ++p)
4253 *output++ = '?';
4254 /* fall through */
4255 case 3: /* ignore */
4256 p = collend;
4257 break;
4258 case 4: /* xmlcharrefreplace */
4259 /* generate replacement (temporarily (mis)uses p) */
4260 for (p = collstart; p < collend; ++p)
4261 output += sprintf(output, "&#%d;", (int)*p);
4262 p = collend;
4263 break;
4264 default:
4265 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4266 encoding, reason, s, length, &exc,
4267 collstart-s, collend-s, &newpos);
4268 if (repunicode == NULL)
4269 goto onError;
4270 /* generate replacement */
4271 repsize = PyUnicode_GET_SIZE(repunicode);
4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4273 Py_UNICODE ch = *uni2;
4274 if (Py_UNICODE_ISSPACE(ch))
4275 *output++ = ' ';
4276 else {
4277 decimal = Py_UNICODE_TODECIMAL(ch);
4278 if (decimal >= 0)
4279 *output++ = '0' + decimal;
4280 else if (0 < ch && ch < 256)
4281 *output++ = (char)ch;
4282 else {
4283 Py_DECREF(repunicode);
4284 raise_encode_exception(&exc, encoding,
4285 s, length, collstart-s, collend-s, reason);
4286 goto onError;
4287 }
4288 }
4289 }
4290 p = s + newpos;
4291 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004292 }
4293 }
4294 /* 0-terminate the output string */
4295 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_XDECREF(exc);
4297 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004298 return 0;
4299
4300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(exc);
4302 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004303 return -1;
4304}
4305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306/* --- Helpers ------------------------------------------------------------ */
4307
Thomas Wouters477c8d52006-05-27 19:21:47 +00004308#define STRINGLIB_CHAR Py_UNICODE
4309
4310#define STRINGLIB_LEN PyUnicode_GET_SIZE
4311#define STRINGLIB_NEW PyUnicode_FromUnicode
4312#define STRINGLIB_STR PyUnicode_AS_UNICODE
4313
4314Py_LOCAL_INLINE(int)
4315STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004317 if (str[0] != other[0])
4318 return 1;
4319 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320}
4321
Thomas Wouters477c8d52006-05-27 19:21:47 +00004322#define STRINGLIB_EMPTY unicode_empty
4323
4324#include "stringlib/fastsearch.h"
4325
4326#include "stringlib/count.h"
4327#include "stringlib/find.h"
4328#include "stringlib/partition.h"
4329
/* helper macro to fixup start/end slice values

   Operates on the variables named start and end that are in scope at
   the point of use; obj must be a pointer to something with a length
   member.  A negative start has the length added and is then clamped
   to 0.  end is first capped at the length; a negative end has the
   length added and is then clamped to 0.  start is not capped at the
   length.

   NOTE(review): the expansion is a sequence of bare if statements, not
   a single statement, so it should not be used as the unbraced body of
   an if/else or a loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
4342
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004344 PyObject *substr,
4345 Py_ssize_t start,
4346 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004348 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004349 PyUnicodeObject* str_obj;
4350 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004351
Thomas Wouters477c8d52006-05-27 19:21:47 +00004352 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4353 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004355 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4356 if (!sub_obj) {
4357 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 return -1;
4359 }
Tim Petersced69f82003-09-16 20:30:58 +00004360
Thomas Wouters477c8d52006-05-27 19:21:47 +00004361 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004362
Thomas Wouters477c8d52006-05-27 19:21:47 +00004363 result = stringlib_count(
4364 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4365 );
4366
4367 Py_DECREF(sub_obj);
4368 Py_DECREF(str_obj);
4369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 return result;
4371}
4372
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 PyObject *sub,
4375 Py_ssize_t start,
4376 Py_ssize_t end,
4377 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004382 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004383 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004384 sub = PyUnicode_FromObject(sub);
4385 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004386 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004387 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 }
Tim Petersced69f82003-09-16 20:30:58 +00004389
Thomas Wouters477c8d52006-05-27 19:21:47 +00004390 if (direction > 0)
4391 result = stringlib_find_slice(
4392 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4393 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4394 start, end
4395 );
4396 else
4397 result = stringlib_rfind_slice(
4398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4400 start, end
4401 );
4402
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004404 Py_DECREF(sub);
4405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 return result;
4407}
4408
Tim Petersced69f82003-09-16 20:30:58 +00004409static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410int tailmatch(PyUnicodeObject *self,
4411 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t start,
4413 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 int direction)
4415{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 if (substring->length == 0)
4417 return 1;
4418
Thomas Wouters477c8d52006-05-27 19:21:47 +00004419 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420
4421 end -= substring->length;
4422 if (end < start)
4423 return 0;
4424
4425 if (direction > 0) {
4426 if (Py_UNICODE_MATCH(self, end, substring))
4427 return 1;
4428 } else {
4429 if (Py_UNICODE_MATCH(self, start, substring))
4430 return 1;
4431 }
4432
4433 return 0;
4434}
4435
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t start,
4439 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 int direction)
4441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 str = PyUnicode_FromObject(str);
4445 if (str == NULL)
4446 return -1;
4447 substr = PyUnicode_FromObject(substr);
4448 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004449 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 return -1;
4451 }
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 result = tailmatch((PyUnicodeObject *)str,
4454 (PyUnicodeObject *)substr,
4455 start, end, direction);
4456 Py_DECREF(str);
4457 Py_DECREF(substr);
4458 return result;
4459}
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461/* Apply fixfct filter to the Unicode object self and return a
4462 reference to the modified object */
4463
Tim Petersced69f82003-09-16 20:30:58 +00004464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465PyObject *fixup(PyUnicodeObject *self,
4466 int (*fixfct)(PyUnicodeObject *s))
4467{
4468
4469 PyUnicodeObject *u;
4470
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004471 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (u == NULL)
4473 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004474
4475 Py_UNICODE_COPY(u->str, self->str, self->length);
4476
Tim Peters7a29bd52001-09-12 03:03:31 +00004477 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 /* fixfct should return TRUE if it modified the buffer. If
4479 FALSE, return a reference to the original buffer instead
4480 (to save space, not time) */
4481 Py_INCREF(self);
4482 Py_DECREF(u);
4483 return (PyObject*) self;
4484 }
4485 return (PyObject*) u;
4486}
4487
Tim Petersced69f82003-09-16 20:30:58 +00004488static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489int fixupper(PyUnicodeObject *self)
4490{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 Py_UNICODE *s = self->str;
4493 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004494
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 while (len-- > 0) {
4496 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 ch = Py_UNICODE_TOUPPER(*s);
4499 if (ch != *s) {
4500 status = 1;
4501 *s = ch;
4502 }
4503 s++;
4504 }
4505
4506 return status;
4507}
4508
Tim Petersced69f82003-09-16 20:30:58 +00004509static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510int fixlower(PyUnicodeObject *self)
4511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004512 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 Py_UNICODE *s = self->str;
4514 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004515
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 while (len-- > 0) {
4517 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004518
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 ch = Py_UNICODE_TOLOWER(*s);
4520 if (ch != *s) {
4521 status = 1;
4522 *s = ch;
4523 }
4524 s++;
4525 }
4526
4527 return status;
4528}
4529
Tim Petersced69f82003-09-16 20:30:58 +00004530static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531int fixswapcase(PyUnicodeObject *self)
4532{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 Py_UNICODE *s = self->str;
4535 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 while (len-- > 0) {
4538 if (Py_UNICODE_ISUPPER(*s)) {
4539 *s = Py_UNICODE_TOLOWER(*s);
4540 status = 1;
4541 } else if (Py_UNICODE_ISLOWER(*s)) {
4542 *s = Py_UNICODE_TOUPPER(*s);
4543 status = 1;
4544 }
4545 s++;
4546 }
4547
4548 return status;
4549}
4550
Tim Petersced69f82003-09-16 20:30:58 +00004551static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552int fixcapitalize(PyUnicodeObject *self)
4553{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004555 Py_UNICODE *s = self->str;
4556 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004557
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004558 if (len == 0)
4559 return 0;
4560 if (Py_UNICODE_ISLOWER(*s)) {
4561 *s = Py_UNICODE_TOUPPER(*s);
4562 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004564 s++;
4565 while (--len > 0) {
4566 if (Py_UNICODE_ISUPPER(*s)) {
4567 *s = Py_UNICODE_TOLOWER(*s);
4568 status = 1;
4569 }
4570 s++;
4571 }
4572 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573}
4574
4575static
4576int fixtitle(PyUnicodeObject *self)
4577{
4578 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4579 register Py_UNICODE *e;
4580 int previous_is_cased;
4581
4582 /* Shortcut for single character strings */
4583 if (PyUnicode_GET_SIZE(self) == 1) {
4584 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4585 if (*p != ch) {
4586 *p = ch;
4587 return 1;
4588 }
4589 else
4590 return 0;
4591 }
Tim Petersced69f82003-09-16 20:30:58 +00004592
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 e = p + PyUnicode_GET_SIZE(self);
4594 previous_is_cased = 0;
4595 for (; p < e; p++) {
4596 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004597
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 if (previous_is_cased)
4599 *p = Py_UNICODE_TOLOWER(ch);
4600 else
4601 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004602
4603 if (Py_UNICODE_ISLOWER(ch) ||
4604 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 Py_UNICODE_ISTITLE(ch))
4606 previous_is_cased = 1;
4607 else
4608 previous_is_cased = 0;
4609 }
4610 return 1;
4611}
4612
Tim Peters8ce9f162004-08-27 01:49:32 +00004613PyObject *
4614PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Tim Peters8ce9f162004-08-27 01:49:32 +00004616 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004617 const Py_UNICODE blank = ' ';
4618 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004619 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004620 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004621 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4622 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004623 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4624 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004626 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004627 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
Tim Peters05eba1f2004-08-27 21:32:02 +00004629 fseq = PySequence_Fast(seq, "");
4630 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004632 }
4633
Tim Peters91879ab2004-08-27 22:35:44 +00004634 /* Grrrr. A codec may be invoked to convert str objects to
4635 * Unicode, and so it's possible to call back into Python code
4636 * during PyUnicode_FromObject(), and so it's possible for a sick
4637 * codec to change the size of fseq (if seq is a list). Therefore
4638 * we have to keep refetching the size -- can't assume seqlen
4639 * is invariant.
4640 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004641 seqlen = PySequence_Fast_GET_SIZE(fseq);
4642 /* If empty sequence, return u"". */
4643 if (seqlen == 0) {
4644 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4645 goto Done;
4646 }
4647 /* If singleton sequence with an exact Unicode, return that. */
4648 if (seqlen == 1) {
4649 item = PySequence_Fast_GET_ITEM(fseq, 0);
4650 if (PyUnicode_CheckExact(item)) {
4651 Py_INCREF(item);
4652 res = (PyUnicodeObject *)item;
4653 goto Done;
4654 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004655 }
4656
Tim Peters05eba1f2004-08-27 21:32:02 +00004657 /* At least two items to join, or one that isn't exact Unicode. */
4658 if (seqlen > 1) {
4659 /* Set up sep and seplen -- they're needed. */
4660 if (separator == NULL) {
4661 sep = &blank;
4662 seplen = 1;
4663 }
4664 else {
4665 internal_separator = PyUnicode_FromObject(separator);
4666 if (internal_separator == NULL)
4667 goto onError;
4668 sep = PyUnicode_AS_UNICODE(internal_separator);
4669 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004670 /* In case PyUnicode_FromObject() mutated seq. */
4671 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004672 }
4673 }
4674
4675 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004676 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004677 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004678 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 res_p = PyUnicode_AS_UNICODE(res);
4680 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004681
Tim Peters05eba1f2004-08-27 21:32:02 +00004682 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004683 Py_ssize_t itemlen;
4684 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004685
4686 item = PySequence_Fast_GET_ITEM(fseq, i);
4687 /* Convert item to Unicode. */
4688 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4689 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004690 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004691 " %.80s found",
4692 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004693 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004694 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004695 item = PyUnicode_FromObject(item);
4696 if (item == NULL)
4697 goto onError;
4698 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004699
Tim Peters91879ab2004-08-27 22:35:44 +00004700 /* In case PyUnicode_FromObject() mutated seq. */
4701 seqlen = PySequence_Fast_GET_SIZE(fseq);
4702
Tim Peters8ce9f162004-08-27 01:49:32 +00004703 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004705 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004707 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004708 if (i < seqlen - 1) {
4709 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004710 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004711 goto Overflow;
4712 }
4713 if (new_res_used > res_alloc) {
4714 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004715 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004716 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004717 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004718 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004719 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004720 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004721 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004723 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004724 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004726
4727 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004728 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004729 res_p += itemlen;
4730 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004731 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004732 res_p += seplen;
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004735 res_used = new_res_used;
4736 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004737
Tim Peters05eba1f2004-08-27 21:32:02 +00004738 /* Shrink res to match the used area; this probably can't fail,
4739 * but it's cheap to check.
4740 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004741 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004742 goto onError;
4743
4744 Done:
4745 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004746 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 return (PyObject *)res;
4748
Tim Peters8ce9f162004-08-27 01:49:32 +00004749 Overflow:
4750 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004751 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004752 Py_DECREF(item);
4753 /* fall through */
4754
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004756 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004757 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 return NULL;
4760}
4761
Tim Petersced69f82003-09-16 20:30:58 +00004762static
4763PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t left,
4765 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 Py_UNICODE fill)
4767{
4768 PyUnicodeObject *u;
4769
4770 if (left < 0)
4771 left = 0;
4772 if (right < 0)
4773 right = 0;
4774
Tim Peters7a29bd52001-09-12 03:03:31 +00004775 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 Py_INCREF(self);
4777 return self;
4778 }
4779
4780 u = _PyUnicode_New(left + self->length + right);
4781 if (u) {
4782 if (left)
4783 Py_UNICODE_FILL(u->str, fill, left);
4784 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4785 if (right)
4786 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4787 }
4788
4789 return u;
4790}
4791
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `PyObject *list`, and an `onError` label; the macro jumps to that
   label if creating the slice or appending it fails.  The reference
   created for the slice is always released here.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and is safe in an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4802
4803static
4804PyObject *split_whitespace(PyUnicodeObject *self,
4805 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 register Py_ssize_t i;
4809 register Py_ssize_t j;
4810 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 PyObject *str;
4812
4813 for (i = j = 0; i < len; ) {
4814 /* find a token */
4815 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4816 i++;
4817 j = i;
4818 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4819 i++;
4820 if (j < i) {
4821 if (maxcount-- <= 0)
4822 break;
4823 SPLIT_APPEND(self->str, j, i);
4824 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4825 i++;
4826 j = i;
4827 }
4828 }
4829 if (j < len) {
4830 SPLIT_APPEND(self->str, j, len);
4831 }
4832 return list;
4833
4834 onError:
4835 Py_DECREF(list);
4836 return NULL;
4837}
4838
4839PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004840 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004842 register Py_ssize_t i;
4843 register Py_ssize_t j;
4844 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 PyObject *list;
4846 PyObject *str;
4847 Py_UNICODE *data;
4848
4849 string = PyUnicode_FromObject(string);
4850 if (string == NULL)
4851 return NULL;
4852 data = PyUnicode_AS_UNICODE(string);
4853 len = PyUnicode_GET_SIZE(string);
4854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 list = PyList_New(0);
4856 if (!list)
4857 goto onError;
4858
4859 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004863 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004867 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 if (i < len) {
4869 if (data[i] == '\r' && i + 1 < len &&
4870 data[i+1] == '\n')
4871 i += 2;
4872 else
4873 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004874 if (keepends)
4875 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 }
Guido van Rossum86662912000-04-11 15:38:46 +00004877 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 j = i;
4879 }
4880 if (j < len) {
4881 SPLIT_APPEND(data, j, len);
4882 }
4883
4884 Py_DECREF(string);
4885 return list;
4886
4887 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004888 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 Py_DECREF(string);
4890 return NULL;
4891}
4892
Tim Petersced69f82003-09-16 20:30:58 +00004893static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894PyObject *split_char(PyUnicodeObject *self,
4895 PyObject *list,
4896 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 register Py_ssize_t i;
4900 register Py_ssize_t j;
4901 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 PyObject *str;
4903
4904 for (i = j = 0; i < len; ) {
4905 if (self->str[i] == ch) {
4906 if (maxcount-- <= 0)
4907 break;
4908 SPLIT_APPEND(self->str, j, i);
4909 i = j = i + 1;
4910 } else
4911 i++;
4912 }
4913 if (j <= len) {
4914 SPLIT_APPEND(self->str, j, len);
4915 }
4916 return list;
4917
4918 onError:
4919 Py_DECREF(list);
4920 return NULL;
4921}
4922
Tim Petersced69f82003-09-16 20:30:58 +00004923static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924PyObject *split_substring(PyUnicodeObject *self,
4925 PyObject *list,
4926 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004929 register Py_ssize_t i;
4930 register Py_ssize_t j;
4931 Py_ssize_t len = self->length;
4932 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 PyObject *str;
4934
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004935 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 if (Py_UNICODE_MATCH(self, i, substring)) {
4937 if (maxcount-- <= 0)
4938 break;
4939 SPLIT_APPEND(self->str, j, i);
4940 i = j = i + sublen;
4941 } else
4942 i++;
4943 }
4944 if (j <= len) {
4945 SPLIT_APPEND(self->str, j, len);
4946 }
4947 return list;
4948
4949 onError:
4950 Py_DECREF(list);
4951 return NULL;
4952}
4953
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004954static
4955PyObject *rsplit_whitespace(PyUnicodeObject *self,
4956 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004957 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 register Py_ssize_t i;
4960 register Py_ssize_t j;
4961 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004962 PyObject *str;
4963
4964 for (i = j = len - 1; i >= 0; ) {
4965 /* find a token */
4966 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4967 i--;
4968 j = i;
4969 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4970 i--;
4971 if (j > i) {
4972 if (maxcount-- <= 0)
4973 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004974 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4976 i--;
4977 j = i;
4978 }
4979 }
4980 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004981 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004983 if (PyList_Reverse(list) < 0)
4984 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004985 return list;
4986
4987 onError:
4988 Py_DECREF(list);
4989 return NULL;
4990}
4991
4992static
4993PyObject *rsplit_char(PyUnicodeObject *self,
4994 PyObject *list,
4995 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004996 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 register Py_ssize_t i;
4999 register Py_ssize_t j;
5000 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005001 PyObject *str;
5002
5003 for (i = j = len - 1; i >= 0; ) {
5004 if (self->str[i] == ch) {
5005 if (maxcount-- <= 0)
5006 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005007 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005008 j = i = i - 1;
5009 } else
5010 i--;
5011 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005012 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005013 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005014 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015 if (PyList_Reverse(list) < 0)
5016 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005017 return list;
5018
5019 onError:
5020 Py_DECREF(list);
5021 return NULL;
5022}
5023
5024static
5025PyObject *rsplit_substring(PyUnicodeObject *self,
5026 PyObject *list,
5027 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005028 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 register Py_ssize_t i;
5031 register Py_ssize_t j;
5032 Py_ssize_t len = self->length;
5033 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005034 PyObject *str;
5035
5036 for (i = len - sublen, j = len; i >= 0; ) {
5037 if (Py_UNICODE_MATCH(self, i, substring)) {
5038 if (maxcount-- <= 0)
5039 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005040 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041 j = i;
5042 i -= sublen;
5043 } else
5044 i--;
5045 }
5046 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005049 if (PyList_Reverse(list) < 0)
5050 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005051 return list;
5052
5053 onError:
5054 Py_DECREF(list);
5055 return NULL;
5056}
5057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058#undef SPLIT_APPEND
5059
5060static
5061PyObject *split(PyUnicodeObject *self,
5062 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005063 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
5065 PyObject *list;
5066
5067 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005068 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
5070 list = PyList_New(0);
5071 if (!list)
5072 return NULL;
5073
5074 if (substring == NULL)
5075 return split_whitespace(self,list,maxcount);
5076
5077 else if (substring->length == 1)
5078 return split_char(self,list,substring->str[0],maxcount);
5079
5080 else if (substring->length == 0) {
5081 Py_DECREF(list);
5082 PyErr_SetString(PyExc_ValueError, "empty separator");
5083 return NULL;
5084 }
5085 else
5086 return split_substring(self,list,substring,maxcount);
5087}
5088
Tim Petersced69f82003-09-16 20:30:58 +00005089static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005090PyObject *rsplit(PyUnicodeObject *self,
5091 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005093{
5094 PyObject *list;
5095
5096 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005097 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005098
5099 list = PyList_New(0);
5100 if (!list)
5101 return NULL;
5102
5103 if (substring == NULL)
5104 return rsplit_whitespace(self,list,maxcount);
5105
5106 else if (substring->length == 1)
5107 return rsplit_char(self,list,substring->str[0],maxcount);
5108
5109 else if (substring->length == 0) {
5110 Py_DECREF(list);
5111 PyErr_SetString(PyExc_ValueError, "empty separator");
5112 return NULL;
5113 }
5114 else
5115 return rsplit_substring(self,list,substring,maxcount);
5116}
5117
5118static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119PyObject *replace(PyUnicodeObject *self,
5120 PyUnicodeObject *str1,
5121 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005122 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
5124 PyUnicodeObject *u;
5125
5126 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005127 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128
Thomas Wouters477c8d52006-05-27 19:21:47 +00005129 if (str1->length == str2->length) {
5130 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005131 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005132 if (str1->length == 1) {
5133 /* replace characters */
5134 Py_UNICODE u1, u2;
5135 if (!findchar(self->str, self->length, str1->str[0]))
5136 goto nothing;
5137 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5138 if (!u)
5139 return NULL;
5140 Py_UNICODE_COPY(u->str, self->str, self->length);
5141 u1 = str1->str[0];
5142 u2 = str2->str[0];
5143 for (i = 0; i < u->length; i++)
5144 if (u->str[i] == u1) {
5145 if (--maxcount < 0)
5146 break;
5147 u->str[i] = u2;
5148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 i = fastsearch(
5151 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 if (i < 0)
5154 goto nothing;
5155 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5156 if (!u)
5157 return NULL;
5158 Py_UNICODE_COPY(u->str, self->str, self->length);
5159 while (i <= self->length - str1->length)
5160 if (Py_UNICODE_MATCH(self, i, str1)) {
5161 if (--maxcount < 0)
5162 break;
5163 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5164 i += str1->length;
5165 } else
5166 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005169
5170 Py_ssize_t n, i, j, e;
5171 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 Py_UNICODE *p;
5173
5174 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 if (n > maxcount)
5177 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005178 if (n == 0)
5179 goto nothing;
5180 /* new_size = self->length + n * (str2->length - str1->length)); */
5181 delta = (str2->length - str1->length);
5182 if (delta == 0) {
5183 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005185 product = n * (str2->length - str1->length);
5186 if ((product / (str2->length - str1->length)) != n) {
5187 PyErr_SetString(PyExc_OverflowError,
5188 "replace string is too long");
5189 return NULL;
5190 }
5191 new_size = self->length + product;
5192 if (new_size < 0) {
5193 PyErr_SetString(PyExc_OverflowError,
5194 "replace string is too long");
5195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
5197 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005198 u = _PyUnicode_New(new_size);
5199 if (!u)
5200 return NULL;
5201 i = 0;
5202 p = u->str;
5203 e = self->length - str1->length;
5204 if (str1->length > 0) {
5205 while (n-- > 0) {
5206 /* look for next match */
5207 j = i;
5208 while (j <= e) {
5209 if (Py_UNICODE_MATCH(self, j, str1))
5210 break;
5211 j++;
5212 }
5213 if (j > i) {
5214 if (j > e)
5215 break;
5216 /* copy unchanged part [i:j] */
5217 Py_UNICODE_COPY(p, self->str+i, j-i);
5218 p += j - i;
5219 }
5220 /* copy substitution string */
5221 if (str2->length > 0) {
5222 Py_UNICODE_COPY(p, str2->str, str2->length);
5223 p += str2->length;
5224 }
5225 i = j + str1->length;
5226 }
5227 if (i < self->length)
5228 /* copy tail [i:] */
5229 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5230 } else {
5231 /* interleave */
5232 while (n > 0) {
5233 Py_UNICODE_COPY(p, str2->str, str2->length);
5234 p += str2->length;
5235 if (--n <= 0)
5236 break;
5237 *p++ = self->str[i++];
5238 }
5239 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005243
5244nothing:
5245 /* nothing to replace; return original string (when possible) */
5246 if (PyUnicode_CheckExact(self)) {
5247 Py_INCREF(self);
5248 return (PyObject *) self;
5249 }
5250 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251}
5252
5253/* --- Unicode Object Methods --------------------------------------------- */
5254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005255PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256"S.title() -> unicode\n\
5257\n\
5258Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005259characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005262unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 return fixup(self, fixtitle);
5265}
5266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005267PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268"S.capitalize() -> unicode\n\
5269\n\
5270Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005271have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005274unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 return fixup(self, fixcapitalize);
5277}
5278
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* S.capwords() (compiled out): split self on whitespace, run every word
   through fixup() with fixcapitalize, and join the words again with
   PyUnicode_Join() using its default separator.  Returns NULL on
   failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5316
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005317/* Argument converter. Coerces to a single unicode character */
5318
5319static int
5320convert_uc(PyObject *obj, void *addr)
5321{
5322 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5323 PyObject *uniobj;
5324 Py_UNICODE *unistr;
5325
5326 uniobj = PyUnicode_FromObject(obj);
5327 if (uniobj == NULL) {
5328 PyErr_SetString(PyExc_TypeError,
5329 "The fill character cannot be converted to Unicode");
5330 return 0;
5331 }
5332 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5333 PyErr_SetString(PyExc_TypeError,
5334 "The fill character must be exactly one character long");
5335 Py_DECREF(uniobj);
5336 return 0;
5337 }
5338 unistr = PyUnicode_AS_UNICODE(uniobj);
5339 *fillcharloc = unistr[0];
5340 Py_DECREF(uniobj);
5341 return 1;
5342}
5343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005344PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005345"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005347Return S centered in a Unicode string of length width. Padding is\n\
5348done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
5350static PyObject *
5351unicode_center(PyUnicodeObject *self, PyObject *args)
5352{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t marg, left;
5354 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005355 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Thomas Woutersde017742006-02-16 19:34:37 +00005357 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 return NULL;
5359
Tim Peters7a29bd52001-09-12 03:03:31 +00005360 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 Py_INCREF(self);
5362 return (PyObject*) self;
5363 }
5364
5365 marg = width - self->length;
5366 left = marg / 2 + (marg & width & 1);
5367
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005368 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369}
5370
Marc-André Lemburge5034372000-08-08 08:04:29 +00005371#if 0
5372
5373/* This code should go into some future Unicode collation support
5374 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005375 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005377/* speedy UTF-16 code point order comparison */
5378/* gleaned from: */
5379/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5380
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005381static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005382{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005383 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005384 0, 0, 0, 0, 0, 0, 0, 0,
5385 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005386 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005387};
5388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389static int
5390unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005392 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 Py_UNICODE *s1 = str1->str;
5395 Py_UNICODE *s2 = str2->str;
5396
5397 len1 = str1->length;
5398 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005401 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005402
5403 c1 = *s1++;
5404 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005405
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005406 if (c1 > (1<<11) * 26)
5407 c1 += utf16Fixup[c1>>11];
5408 if (c2 > (1<<11) * 26)
5409 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005410 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005411
5412 if (c1 != c2)
5413 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005414
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005415 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 }
5417
5418 return (len1 < len2) ? -1 : (len1 != len2);
5419}
5420
Marc-André Lemburge5034372000-08-08 08:04:29 +00005421#else
5422
5423static int
5424unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005426 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005427
5428 Py_UNICODE *s1 = str1->str;
5429 Py_UNICODE *s2 = str2->str;
5430
5431 len1 = str1->length;
5432 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005433
Marc-André Lemburge5034372000-08-08 08:04:29 +00005434 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005435 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005436
Fredrik Lundh45714e92001-06-26 16:39:36 +00005437 c1 = *s1++;
5438 c2 = *s2++;
5439
5440 if (c1 != c2)
5441 return (c1 < c2) ? -1 : 1;
5442
Marc-André Lemburge5034372000-08-08 08:04:29 +00005443 len1--; len2--;
5444 }
5445
5446 return (len1 < len2) ? -1 : (len1 != len2);
5447}
5448
5449#endif
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451int PyUnicode_Compare(PyObject *left,
5452 PyObject *right)
5453{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005454 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5455 return unicode_compare((PyUnicodeObject *)left,
5456 (PyUnicodeObject *)right);
5457 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5458 (PyUnicode_Check(left) && PyString_Check(right))) {
5459 if (PyUnicode_Check(left))
5460 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5461 if (PyUnicode_Check(right))
5462 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5463 assert(PyString_Check(left));
5464 assert(PyString_Check(right));
5465 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005467 PyErr_Format(PyExc_TypeError,
5468 "Can't compare %.100s and %.100s",
5469 left->ob_type->tp_name,
5470 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 return -1;
5472}
5473
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005474PyObject *PyUnicode_RichCompare(PyObject *left,
5475 PyObject *right,
5476 int op)
5477{
5478 int result;
5479
5480 result = PyUnicode_Compare(left, right);
5481 if (result == -1 && PyErr_Occurred())
5482 goto onError;
5483
5484 /* Convert the return value to a Boolean */
5485 switch (op) {
5486 case Py_EQ:
5487 result = (result == 0);
5488 break;
5489 case Py_NE:
5490 result = (result != 0);
5491 break;
5492 case Py_LE:
5493 result = (result <= 0);
5494 break;
5495 case Py_GE:
5496 result = (result >= 0);
5497 break;
5498 case Py_LT:
5499 result = (result == -1);
5500 break;
5501 case Py_GT:
5502 result = (result == 1);
5503 break;
5504 }
5505 return PyBool_FromLong(result);
5506
5507 onError:
5508
5509 /* Standard case
5510
5511 Type errors mean that PyUnicode_FromObject() could not convert
5512 one of the arguments (usually the right hand side) to Unicode,
5513 ie. we can't handle the comparison request. However, it is
5514 possible that the other object knows a comparison method, which
5515 is why we return Py_NotImplemented to give the other object a
5516 chance.
5517
5518 */
5519 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5520 PyErr_Clear();
5521 Py_INCREF(Py_NotImplemented);
5522 return Py_NotImplemented;
5523 }
5524 if (op != Py_EQ && op != Py_NE)
5525 return NULL;
5526
5527 /* Equality comparison.
5528
5529 This is a special case: we silence any PyExc_UnicodeDecodeError
5530 and instead turn it into a PyErr_UnicodeWarning.
5531
5532 */
5533 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5534 return NULL;
5535 PyErr_Clear();
5536 if (PyErr_Warn(PyExc_UnicodeWarning,
5537 (op == Py_EQ) ?
5538 "Unicode equal comparison "
5539 "failed to convert both arguments to Unicode - "
5540 "interpreting them as being unequal" :
5541 "Unicode unequal comparison "
5542 "failed to convert both arguments to Unicode - "
5543 "interpreting them as being unequal"
5544 ) < 0)
5545 return NULL;
5546 result = (op == Py_NE);
5547 return PyBool_FromLong(result);
5548}
5549
Guido van Rossum403d68b2000-03-13 15:55:09 +00005550int PyUnicode_Contains(PyObject *container,
5551 PyObject *element)
5552{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005553 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005555
5556 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005557 sub = PyUnicode_FromObject(element);
5558 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005559 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005560 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005561 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005562 }
5563
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 str = PyUnicode_FromObject(container);
5565 if (!str) {
5566 Py_DECREF(sub);
5567 return -1;
5568 }
5569
5570 result = stringlib_contains_obj(str, sub);
5571
5572 Py_DECREF(str);
5573 Py_DECREF(sub);
5574
Guido van Rossum403d68b2000-03-13 15:55:09 +00005575 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005576}
5577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578/* Concat to string or Unicode object giving a new Unicode object. */
5579
5580PyObject *PyUnicode_Concat(PyObject *left,
5581 PyObject *right)
5582{
5583 PyUnicodeObject *u = NULL, *v = NULL, *w;
5584
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005585 if (PyBytes_Check(left) || PyBytes_Check(right))
5586 return PyBytes_Concat(left, right);
5587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 /* Coerce the two arguments */
5589 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5590 if (u == NULL)
5591 goto onError;
5592 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5593 if (v == NULL)
5594 goto onError;
5595
5596 /* Shortcuts */
5597 if (v == unicode_empty) {
5598 Py_DECREF(v);
5599 return (PyObject *)u;
5600 }
5601 if (u == unicode_empty) {
5602 Py_DECREF(u);
5603 return (PyObject *)v;
5604 }
5605
5606 /* Concat the two Unicode strings */
5607 w = _PyUnicode_New(u->length + v->length);
5608 if (w == NULL)
5609 goto onError;
5610 Py_UNICODE_COPY(w->str, u->str, u->length);
5611 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5612
5613 Py_DECREF(u);
5614 Py_DECREF(v);
5615 return (PyObject *)w;
5616
5617onError:
5618 Py_XDECREF(u);
5619 Py_XDECREF(v);
5620 return NULL;
5621}
5622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005623PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624"S.count(sub[, start[, end]]) -> int\n\
5625\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005626Return the number of non-overlapping occurrences of substring sub in\n\
5627Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005628interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
5630static PyObject *
5631unicode_count(PyUnicodeObject *self, PyObject *args)
5632{
5633 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005635 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 PyObject *result;
5637
Guido van Rossumb8872e62000-05-09 14:14:27 +00005638 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5639 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 return NULL;
5641
5642 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005643 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (substring == NULL)
5645 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005646
Thomas Wouters477c8d52006-05-27 19:21:47 +00005647 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Thomas Wouters477c8d52006-05-27 19:21:47 +00005649 result = PyInt_FromSsize_t(
5650 stringlib_count(self->str + start, end - start,
5651 substring->str, substring->length)
5652 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653
5654 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 return result;
5657}
5658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005660"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005662Encodes S using the codec registered for encoding. encoding defaults\n\
5663to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005664handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5666'xmlcharrefreplace' as well as any other name registered with\n\
5667codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
5669static PyObject *
5670unicode_encode(PyUnicodeObject *self, PyObject *args)
5671{
5672 char *encoding = NULL;
5673 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005674 PyObject *v;
5675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5677 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005678 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005679 if (v == NULL)
5680 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005681 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005682 if (PyString_Check(v)) {
5683 /* Old codec, turn it into bytes */
5684 PyObject *b = PyBytes_FromObject(v);
5685 Py_DECREF(v);
5686 return b;
5687 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005688 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005689 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005690 "(type=%.400s)",
5691 v->ob_type->tp_name);
5692 Py_DECREF(v);
5693 return NULL;
5694 }
5695 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005696
5697 onError:
5698 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005699}
5700
5701PyDoc_STRVAR(decode__doc__,
5702"S.decode([encoding[,errors]]) -> string or unicode\n\
5703\n\
5704Decodes S using the codec registered for encoding. encoding defaults\n\
5705to the default encoding. errors may be given to set a different error\n\
5706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5707a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5708as well as any other name registerd with codecs.register_error that is\n\
5709able to handle UnicodeDecodeErrors.");
5710
5711static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005712unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005713{
5714 char *encoding = NULL;
5715 char *errors = NULL;
5716 PyObject *v;
5717
5718 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5719 return NULL;
5720 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005721 if (v == NULL)
5722 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005723 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5724 PyErr_Format(PyExc_TypeError,
5725 "decoder did not return a string/unicode object "
5726 "(type=%.400s)",
5727 v->ob_type->tp_name);
5728 Py_DECREF(v);
5729 return NULL;
5730 }
5731 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005732
5733 onError:
5734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735}
5736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005737PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738"S.expandtabs([tabsize]) -> unicode\n\
5739\n\
5740Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005741If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743static PyObject*
5744unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5745{
5746 Py_UNICODE *e;
5747 Py_UNICODE *p;
5748 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 PyUnicodeObject *u;
5751 int tabsize = 8;
5752
5753 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5754 return NULL;
5755
Thomas Wouters7e474022000-07-16 12:04:32 +00005756 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 i = j = 0;
5758 e = self->str + self->length;
5759 for (p = self->str; p < e; p++)
5760 if (*p == '\t') {
5761 if (tabsize > 0)
5762 j += tabsize - (j % tabsize);
5763 }
5764 else {
5765 j++;
5766 if (*p == '\n' || *p == '\r') {
5767 i += j;
5768 j = 0;
5769 }
5770 }
5771
5772 /* Second pass: create output string and fill it */
5773 u = _PyUnicode_New(i + j);
5774 if (!u)
5775 return NULL;
5776
5777 j = 0;
5778 q = u->str;
5779
5780 for (p = self->str; p < e; p++)
5781 if (*p == '\t') {
5782 if (tabsize > 0) {
5783 i = tabsize - (j % tabsize);
5784 j += i;
5785 while (i--)
5786 *q++ = ' ';
5787 }
5788 }
5789 else {
5790 j++;
5791 *q++ = *p;
5792 if (*p == '\n' || *p == '\r')
5793 j = 0;
5794 }
5795
5796 return (PyObject*) u;
5797}
5798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005799PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800"S.find(sub [,start [,end]]) -> int\n\
5801\n\
5802Return the lowest index in S where substring sub is found,\n\
5803such that sub is contained within s[start,end]. Optional\n\
5804arguments start and end are interpreted as in slice notation.\n\
5805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005806Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808static PyObject *
5809unicode_find(PyUnicodeObject *self, PyObject *args)
5810{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005811 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005813 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005814 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
Guido van Rossumb8872e62000-05-09 14:14:27 +00005816 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 substring = PyUnicode_FromObject(substring);
5820 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 return NULL;
5822
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 result = stringlib_find_slice(
5824 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5825 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5826 start, end
5827 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830
5831 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832}
5833
5834static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836{
5837 if (index < 0 || index >= self->length) {
5838 PyErr_SetString(PyExc_IndexError, "string index out of range");
5839 return NULL;
5840 }
5841
5842 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5843}
5844
5845static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005846unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005848 /* Since Unicode objects compare equal to their UTF-8 string
5849 counterparts, we hash the UTF-8 string. */
5850 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5851 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852}
5853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005854PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855"S.index(sub [,start [,end]]) -> int\n\
5856\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005857Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
5859static PyObject *
5860unicode_index(PyUnicodeObject *self, PyObject *args)
5861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005865 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Guido van Rossumb8872e62000-05-09 14:14:27 +00005867 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5868 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 substring = PyUnicode_FromObject(substring);
5871 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 return NULL;
5873
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 result = stringlib_find_slice(
5875 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5876 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5877 start, end
5878 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
5880 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 if (result < 0) {
5883 PyErr_SetString(PyExc_ValueError, "substring not found");
5884 return NULL;
5885 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888}
5889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005890PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005891"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005893Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005894at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
5896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005897unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898{
5899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5900 register const Py_UNICODE *e;
5901 int cased;
5902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 /* Shortcut for single character strings */
5904 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005905 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005907 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005908 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005909 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005910
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 e = p + PyUnicode_GET_SIZE(self);
5912 cased = 0;
5913 for (; p < e; p++) {
5914 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005915
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005917 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 else if (!cased && Py_UNICODE_ISLOWER(ch))
5919 cased = 1;
5920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005924PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005925"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005927Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005928at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
5930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005931unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932{
5933 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5934 register const Py_UNICODE *e;
5935 int cased;
5936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 /* Shortcut for single character strings */
5938 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005939 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005941 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005942 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 e = p + PyUnicode_GET_SIZE(self);
5946 cased = 0;
5947 for (; p < e; p++) {
5948 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 else if (!cased && Py_UNICODE_ISUPPER(ch))
5953 cased = 1;
5954 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956}
5957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005958PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005959"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005961Return True if S is a titlecased string and there is at least one\n\
5962character in S, i.e. upper- and titlecase characters may only\n\
5963follow uncased characters and lowercase characters only cased ones.\n\
5964Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
5966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005967unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968{
5969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5970 register const Py_UNICODE *e;
5971 int cased, previous_is_cased;
5972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Shortcut for single character strings */
5974 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005975 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5976 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005978 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005979 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 e = p + PyUnicode_GET_SIZE(self);
5983 cased = 0;
5984 previous_is_cased = 0;
5985 for (; p < e; p++) {
5986 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005987
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5989 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005990 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 previous_is_cased = 1;
5992 cased = 1;
5993 }
5994 else if (Py_UNICODE_ISLOWER(ch)) {
5995 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 previous_is_cased = 1;
5998 cased = 1;
5999 }
6000 else
6001 previous_is_cased = 0;
6002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006003 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004}
6005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006006PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006009Return True if all characters in S are whitespace\n\
6010and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
6012static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006013unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
6015 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6016 register const Py_UNICODE *e;
6017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 /* Shortcut for single character strings */
6019 if (PyUnicode_GET_SIZE(self) == 1 &&
6020 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006023 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006024 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006025 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006026
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 e = p + PyUnicode_GET_SIZE(self);
6028 for (; p < e; p++) {
6029 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006030 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033}
6034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006035PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006038Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006039and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006040
6041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006042unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006043{
6044 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6045 register const Py_UNICODE *e;
6046
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006047 /* Shortcut for single character strings */
6048 if (PyUnicode_GET_SIZE(self) == 1 &&
6049 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006050 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051
6052 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006053 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006054 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006055
6056 e = p + PyUnicode_GET_SIZE(self);
6057 for (; p < e; p++) {
6058 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006059 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062}
6063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006064PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006067Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006069
6070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006071unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006072{
6073 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6074 register const Py_UNICODE *e;
6075
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006076 /* Shortcut for single character strings */
6077 if (PyUnicode_GET_SIZE(self) == 1 &&
6078 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006080
6081 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006082 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006084
6085 e = p + PyUnicode_GET_SIZE(self);
6086 for (; p < e; p++) {
6087 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006088 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006089 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091}
6092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006100unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
6102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6103 register const Py_UNICODE *e;
6104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 /* Shortcut for single character strings */
6106 if (PyUnicode_GET_SIZE(self) == 1 &&
6107 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006110 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006111 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006112 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006113
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 e = p + PyUnicode_GET_SIZE(self);
6115 for (; p < e; p++) {
6116 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006117 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120}
6121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006122PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006125Return True if all characters in S are digits\n\
6126and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
6128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006129unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
6131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6132 register const Py_UNICODE *e;
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 /* Shortcut for single character strings */
6135 if (PyUnicode_GET_SIZE(self) == 1 &&
6136 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006139 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006140 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 e = p + PyUnicode_GET_SIZE(self);
6144 for (; p < e; p++) {
6145 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006146 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149}
6150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006158unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
6160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6161 register const Py_UNICODE *e;
6162
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 /* Shortcut for single character strings */
6164 if (PyUnicode_GET_SIZE(self) == 1 &&
6165 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006168 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006169 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006171
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 e = p + PyUnicode_GET_SIZE(self);
6173 for (; p < e; p++) {
6174 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181"S.join(sequence) -> unicode\n\
6182\n\
6183Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006184sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006187unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006189 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193unicode_length(PyUnicodeObject *self)
6194{
6195 return self->length;
6196}
6197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006199"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200\n\
6201Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006202done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
6204static PyObject *
6205unicode_ljust(PyUnicodeObject *self, PyObject *args)
6206{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006207 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006208 Py_UNICODE fillchar = ' ';
6209
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006210 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 return NULL;
6212
Tim Peters7a29bd52001-09-12 03:03:31 +00006213 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 Py_INCREF(self);
6215 return (PyObject*) self;
6216 }
6217
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006218 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006221PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222"S.lower() -> unicode\n\
6223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
6226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006227unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 return fixup(self, fixlower);
6230}
6231
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6240
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006241/* externally visible for str.strip(unicode) */
6242PyObject *
6243_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6244{
6245 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006247 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6249 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006250
Thomas Wouters477c8d52006-05-27 19:21:47 +00006251 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6252
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006253 i = 0;
6254 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006255 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6256 i++;
6257 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006258 }
6259
6260 j = len;
6261 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 do {
6263 j--;
6264 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6265 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006266 }
6267
6268 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006269 Py_INCREF(self);
6270 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006271 }
6272 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006274}
6275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
6277static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006282
6283 i = 0;
6284 if (striptype != RIGHTSTRIP) {
6285 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6286 i++;
6287 }
6288 }
6289
6290 j = len;
6291 if (striptype != LEFTSTRIP) {
6292 do {
6293 j--;
6294 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6295 j++;
6296 }
6297
6298 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6299 Py_INCREF(self);
6300 return (PyObject*)self;
6301 }
6302 else
6303 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304}
6305
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006306
6307static PyObject *
6308do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6309{
6310 PyObject *sep = NULL;
6311
6312 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6313 return NULL;
6314
6315 if (sep != NULL && sep != Py_None) {
6316 if (PyUnicode_Check(sep))
6317 return _PyUnicode_XStrip(self, striptype, sep);
6318 else if (PyString_Check(sep)) {
6319 PyObject *res;
6320 sep = PyUnicode_FromObject(sep);
6321 if (sep==NULL)
6322 return NULL;
6323 res = _PyUnicode_XStrip(self, striptype, sep);
6324 Py_DECREF(sep);
6325 return res;
6326 }
6327 else {
6328 PyErr_Format(PyExc_TypeError,
6329 "%s arg must be None, unicode or str",
6330 STRIPNAME(striptype));
6331 return NULL;
6332 }
6333 }
6334
6335 return do_strip(self, striptype);
6336}
6337
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006340"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006341\n\
6342Return a copy of the string S with leading and trailing\n\
6343whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006344If chars is given and not None, remove characters in chars instead.\n\
6345If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006346
6347static PyObject *
6348unicode_strip(PyUnicodeObject *self, PyObject *args)
6349{
6350 if (PyTuple_GET_SIZE(args) == 0)
6351 return do_strip(self, BOTHSTRIP); /* Common case */
6352 else
6353 return do_argstrip(self, BOTHSTRIP, args);
6354}
6355
6356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006357PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006358"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006359\n\
6360Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006361If chars is given and not None, remove characters in chars instead.\n\
6362If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006363
6364static PyObject *
6365unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6366{
6367 if (PyTuple_GET_SIZE(args) == 0)
6368 return do_strip(self, LEFTSTRIP); /* Common case */
6369 else
6370 return do_argstrip(self, LEFTSTRIP, args);
6371}
6372
6373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006374PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006375"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006376\n\
6377Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006378If chars is given and not None, remove characters in chars instead.\n\
6379If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006380
6381static PyObject *
6382unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6383{
6384 if (PyTuple_GET_SIZE(args) == 0)
6385 return do_strip(self, RIGHTSTRIP); /* Common case */
6386 else
6387 return do_argstrip(self, RIGHTSTRIP, args);
6388}
6389
6390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 PyUnicodeObject *u;
6395 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006396 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006397 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
6399 if (len < 0)
6400 len = 0;
6401
Tim Peters7a29bd52001-09-12 03:03:31 +00006402 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 /* no repeat, return original string */
6404 Py_INCREF(str);
6405 return (PyObject*) str;
6406 }
Tim Peters8f422462000-09-09 06:13:41 +00006407
6408 /* ensure # of chars needed doesn't overflow int and # of bytes
6409 * needed doesn't overflow size_t
6410 */
6411 nchars = len * str->length;
6412 if (len && nchars / len != str->length) {
6413 PyErr_SetString(PyExc_OverflowError,
6414 "repeated string is too long");
6415 return NULL;
6416 }
6417 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6418 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6419 PyErr_SetString(PyExc_OverflowError,
6420 "repeated string is too long");
6421 return NULL;
6422 }
6423 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 if (!u)
6425 return NULL;
6426
6427 p = u->str;
6428
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429 if (str->length == 1 && len > 0) {
6430 Py_UNICODE_FILL(p, str->str[0], len);
6431 } else {
6432 Py_ssize_t done = 0; /* number of characters copied this far */
6433 if (done < nchars) {
6434 Py_UNICODE_COPY(p, str->str, str->length);
6435 done = str->length;
6436 }
6437 while (done < nchars) {
6438 int n = (done <= nchars-done) ? done : nchars-done;
6439 Py_UNICODE_COPY(p+done, p, n);
6440 done += n;
6441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
6443
6444 return (PyObject*) u;
6445}
6446
6447PyObject *PyUnicode_Replace(PyObject *obj,
6448 PyObject *subobj,
6449 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006450 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
6452 PyObject *self;
6453 PyObject *str1;
6454 PyObject *str2;
6455 PyObject *result;
6456
6457 self = PyUnicode_FromObject(obj);
6458 if (self == NULL)
6459 return NULL;
6460 str1 = PyUnicode_FromObject(subobj);
6461 if (str1 == NULL) {
6462 Py_DECREF(self);
6463 return NULL;
6464 }
6465 str2 = PyUnicode_FromObject(replobj);
6466 if (str2 == NULL) {
6467 Py_DECREF(self);
6468 Py_DECREF(str1);
6469 return NULL;
6470 }
Tim Petersced69f82003-09-16 20:30:58 +00006471 result = replace((PyUnicodeObject *)self,
6472 (PyUnicodeObject *)str1,
6473 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 maxcount);
6475 Py_DECREF(self);
6476 Py_DECREF(str1);
6477 Py_DECREF(str2);
6478 return result;
6479}
6480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482"S.replace (old, new[, maxsplit]) -> unicode\n\
6483\n\
6484Return a copy of S with all occurrences of substring\n\
6485old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
6488static PyObject*
6489unicode_replace(PyUnicodeObject *self, PyObject *args)
6490{
6491 PyUnicodeObject *str1;
6492 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 PyObject *result;
6495
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 return NULL;
6498 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6499 if (str1 == NULL)
6500 return NULL;
6501 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006502 if (str2 == NULL) {
6503 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507 result = replace(self, str1, str2, maxcount);
6508
6509 Py_DECREF(str1);
6510 Py_DECREF(str2);
6511 return result;
6512}
6513
6514static
6515PyObject *unicode_repr(PyObject *unicode)
6516{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006517 PyObject *repr;
6518 char *p;
6519 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6520 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6521
6522 /* XXX(nnorwitz): rather than over-allocating, it would be
6523 better to choose a different scheme. Perhaps scan the
6524 first N-chars of the string and allocate based on that size.
6525 */
6526 /* Initial allocation is based on the longest-possible unichr
6527 escape.
6528
6529 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6530 unichr, so in this case it's the longest unichr escape. In
6531 narrow (UTF-16) builds this is five chars per source unichr
6532 since there are two unichrs in the surrogate pair, so in narrow
6533 (UTF-16) builds it's not the longest unichr escape.
6534
6535 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6536 so in the narrow (UTF-16) build case it's the longest unichr
6537 escape.
6538 */
6539
6540 repr = PyString_FromStringAndSize(NULL,
6541 2 /* quotes */
6542#ifdef Py_UNICODE_WIDE
6543 + 10*size
6544#else
6545 + 6*size
6546#endif
6547 + 1);
6548 if (repr == NULL)
6549 return NULL;
6550
6551 p = PyString_AS_STRING(repr);
6552
6553 /* Add quote */
6554 *p++ = (findchar(s, size, '\'') &&
6555 !findchar(s, size, '"')) ? '"' : '\'';
6556 while (size-- > 0) {
6557 Py_UNICODE ch = *s++;
6558
6559 /* Escape quotes and backslashes */
6560 if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
6561 *p++ = '\\';
6562 *p++ = (char) ch;
6563 continue;
6564 }
6565
6566#ifdef Py_UNICODE_WIDE
6567 /* Map 21-bit characters to '\U00xxxxxx' */
6568 else if (ch >= 0x10000) {
6569 *p++ = '\\';
6570 *p++ = 'U';
6571 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6572 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6573 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6574 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6575 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6576 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6577 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6578 *p++ = hexdigits[ch & 0x0000000F];
6579 continue;
6580 }
6581#else
6582 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6583 else if (ch >= 0xD800 && ch < 0xDC00) {
6584 Py_UNICODE ch2;
6585 Py_UCS4 ucs;
6586
6587 ch2 = *s++;
6588 size--;
6589 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6590 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6591 *p++ = '\\';
6592 *p++ = 'U';
6593 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6594 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6595 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6596 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6597 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6598 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6599 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6600 *p++ = hexdigits[ucs & 0x0000000F];
6601 continue;
6602 }
6603 /* Fall through: isolated surrogates are copied as-is */
6604 s--;
6605 size++;
6606 }
6607#endif
6608
6609 /* Map 16-bit characters to '\uxxxx' */
6610 if (ch >= 256) {
6611 *p++ = '\\';
6612 *p++ = 'u';
6613 *p++ = hexdigits[(ch >> 12) & 0x000F];
6614 *p++ = hexdigits[(ch >> 8) & 0x000F];
6615 *p++ = hexdigits[(ch >> 4) & 0x000F];
6616 *p++ = hexdigits[ch & 0x000F];
6617 }
6618
6619 /* Map special whitespace to '\t', \n', '\r' */
6620 else if (ch == '\t') {
6621 *p++ = '\\';
6622 *p++ = 't';
6623 }
6624 else if (ch == '\n') {
6625 *p++ = '\\';
6626 *p++ = 'n';
6627 }
6628 else if (ch == '\r') {
6629 *p++ = '\\';
6630 *p++ = 'r';
6631 }
6632
6633 /* Map non-printable US ASCII to '\xhh' */
6634 else if (ch < ' ' || ch >= 0x7F) {
6635 *p++ = '\\';
6636 *p++ = 'x';
6637 *p++ = hexdigits[(ch >> 4) & 0x000F];
6638 *p++ = hexdigits[ch & 0x000F];
6639 }
6640
6641 /* Copy everything else as-is */
6642 else
6643 *p++ = (char) ch;
6644 }
6645 /* Add quote */
6646 *p++ = PyString_AS_STRING(repr)[0];
6647
6648 *p = '\0';
6649 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
6650 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651}
6652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654"S.rfind(sub [,start [,end]]) -> int\n\
6655\n\
6656Return the highest index in S where substring sub is found,\n\
6657such that sub is contained within s[start,end]. Optional\n\
6658arguments start and end are interpreted as in slice notation.\n\
6659\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006660Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
6662static PyObject *
6663unicode_rfind(PyUnicodeObject *self, PyObject *args)
6664{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006665 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006667 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
Guido van Rossumb8872e62000-05-09 14:14:27 +00006670 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6671 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673 substring = PyUnicode_FromObject(substring);
6674 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 return NULL;
6676
Thomas Wouters477c8d52006-05-27 19:21:47 +00006677 result = stringlib_rfind_slice(
6678 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6679 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6680 start, end
6681 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
6683 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006684
6685 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686}
6687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006688PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689"S.rindex(sub [,start [,end]]) -> int\n\
6690\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006691Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693static PyObject *
6694unicode_rindex(PyUnicodeObject *self, PyObject *args)
6695{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006697 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006698 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006699 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700
Guido van Rossumb8872e62000-05-09 14:14:27 +00006701 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6702 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006704 substring = PyUnicode_FromObject(substring);
6705 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 return NULL;
6707
Thomas Wouters477c8d52006-05-27 19:21:47 +00006708 result = stringlib_rfind_slice(
6709 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6710 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6711 start, end
6712 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (result < 0) {
6717 PyErr_SetString(PyExc_ValueError, "substring not found");
6718 return NULL;
6719 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006720 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721}
6722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006724"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725\n\
6726Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006727done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
6729static PyObject *
6730unicode_rjust(PyUnicodeObject *self, PyObject *args)
6731{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006732 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006733 Py_UNICODE fillchar = ' ';
6734
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006735 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 return NULL;
6737
Tim Peters7a29bd52001-09-12 03:03:31 +00006738 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 Py_INCREF(self);
6740 return (PyObject*) self;
6741 }
6742
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006743 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
6749 /* standard clamping */
6750 if (start < 0)
6751 start = 0;
6752 if (end < 0)
6753 end = 0;
6754 if (end > self->length)
6755 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006756 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 /* full slice, return original string */
6758 Py_INCREF(self);
6759 return (PyObject*) self;
6760 }
6761 if (start > end)
6762 start = end;
6763 /* copy slice */
6764 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6765 end - start);
6766}
6767
6768PyObject *PyUnicode_Split(PyObject *s,
6769 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006770 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771{
6772 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 s = PyUnicode_FromObject(s);
6775 if (s == NULL)
6776 return NULL;
6777 if (sep != NULL) {
6778 sep = PyUnicode_FromObject(sep);
6779 if (sep == NULL) {
6780 Py_DECREF(s);
6781 return NULL;
6782 }
6783 }
6784
6785 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6786
6787 Py_DECREF(s);
6788 Py_XDECREF(sep);
6789 return result;
6790}
6791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006792PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793"S.split([sep [,maxsplit]]) -> list of strings\n\
6794\n\
6795Return a list of the words in S, using sep as the\n\
6796delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006797splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006798any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
6800static PyObject*
6801unicode_split(PyUnicodeObject *self, PyObject *args)
6802{
6803 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006804 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
Martin v. Löwis18e16552006-02-15 17:27:45 +00006806 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 return NULL;
6808
6809 if (substring == Py_None)
6810 return split(self, NULL, maxcount);
6811 else if (PyUnicode_Check(substring))
6812 return split(self, (PyUnicodeObject *)substring, maxcount);
6813 else
6814 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6815}
6816
Thomas Wouters477c8d52006-05-27 19:21:47 +00006817PyObject *
6818PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6819{
6820 PyObject* str_obj;
6821 PyObject* sep_obj;
6822 PyObject* out;
6823
6824 str_obj = PyUnicode_FromObject(str_in);
6825 if (!str_obj)
6826 return NULL;
6827 sep_obj = PyUnicode_FromObject(sep_in);
6828 if (!sep_obj) {
6829 Py_DECREF(str_obj);
6830 return NULL;
6831 }
6832
6833 out = stringlib_partition(
6834 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6835 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6836 );
6837
6838 Py_DECREF(sep_obj);
6839 Py_DECREF(str_obj);
6840
6841 return out;
6842}
6843
6844
6845PyObject *
6846PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6847{
6848 PyObject* str_obj;
6849 PyObject* sep_obj;
6850 PyObject* out;
6851
6852 str_obj = PyUnicode_FromObject(str_in);
6853 if (!str_obj)
6854 return NULL;
6855 sep_obj = PyUnicode_FromObject(sep_in);
6856 if (!sep_obj) {
6857 Py_DECREF(str_obj);
6858 return NULL;
6859 }
6860
6861 out = stringlib_rpartition(
6862 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6863 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6864 );
6865
6866 Py_DECREF(sep_obj);
6867 Py_DECREF(str_obj);
6868
6869 return out;
6870}
6871
6872PyDoc_STRVAR(partition__doc__,
6873"S.partition(sep) -> (head, sep, tail)\n\
6874\n\
6875Searches for the separator sep in S, and returns the part before it,\n\
6876the separator itself, and the part after it. If the separator is not\n\
6877found, returns S and two empty strings.");
6878
6879static PyObject*
6880unicode_partition(PyUnicodeObject *self, PyObject *separator)
6881{
6882 return PyUnicode_Partition((PyObject *)self, separator);
6883}
6884
6885PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006886"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887\n\
6888Searches for the separator sep in S, starting at the end of S, and returns\n\
6889the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006890separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006891
6892static PyObject*
6893unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6894{
6895 return PyUnicode_RPartition((PyObject *)self, separator);
6896}
6897
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006898PyObject *PyUnicode_RSplit(PyObject *s,
6899 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006901{
6902 PyObject *result;
6903
6904 s = PyUnicode_FromObject(s);
6905 if (s == NULL)
6906 return NULL;
6907 if (sep != NULL) {
6908 sep = PyUnicode_FromObject(sep);
6909 if (sep == NULL) {
6910 Py_DECREF(s);
6911 return NULL;
6912 }
6913 }
6914
6915 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6916
6917 Py_DECREF(s);
6918 Py_XDECREF(sep);
6919 return result;
6920}
6921
6922PyDoc_STRVAR(rsplit__doc__,
6923"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6924\n\
6925Return a list of the words in S, using sep as the\n\
6926delimiter string, starting at the end of the string and\n\
6927working to the front. If maxsplit is given, at most maxsplit\n\
6928splits are done. If sep is not specified, any whitespace string\n\
6929is a separator.");
6930
6931static PyObject*
6932unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6933{
6934 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006936
Martin v. Löwis18e16552006-02-15 17:27:45 +00006937 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006938 return NULL;
6939
6940 if (substring == Py_None)
6941 return rsplit(self, NULL, maxcount);
6942 else if (PyUnicode_Check(substring))
6943 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6944 else
6945 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6946}
6947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006949"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950\n\
6951Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006952Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006953is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
6955static PyObject*
6956unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6957{
Guido van Rossum86662912000-04-11 15:38:46 +00006958 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
Guido van Rossum86662912000-04-11 15:38:46 +00006960 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 return NULL;
6962
Guido van Rossum86662912000-04-11 15:38:46 +00006963 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964}
6965
6966static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006967PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006969 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6970 Py_XINCREF(res);
6971 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975"S.swapcase() -> unicode\n\
6976\n\
6977Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006978and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
6980static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006981unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 return fixup(self, fixswapcase);
6984}
6985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006986PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987"S.translate(table) -> unicode\n\
6988\n\
6989Return a copy of the string S, where all characters have been mapped\n\
6990through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006991Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6992Unmapped characters are left untouched. Characters mapped to None\n\
6993are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Tim Petersced69f82003-09-16 20:30:58 +00006998 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007000 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 "ignore");
7002}
7003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007004PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005"S.upper() -> unicode\n\
7006\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007007Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
7009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007010unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 return fixup(self, fixupper);
7013}
7014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007015PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016"S.zfill(width) -> unicode\n\
7017\n\
7018Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
7021static PyObject *
7022unicode_zfill(PyUnicodeObject *self, PyObject *args)
7023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007024 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 PyUnicodeObject *u;
7026
Martin v. Löwis18e16552006-02-15 17:27:45 +00007027 Py_ssize_t width;
7028 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 return NULL;
7030
7031 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007032 if (PyUnicode_CheckExact(self)) {
7033 Py_INCREF(self);
7034 return (PyObject*) self;
7035 }
7036 else
7037 return PyUnicode_FromUnicode(
7038 PyUnicode_AS_UNICODE(self),
7039 PyUnicode_GET_SIZE(self)
7040 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 }
7042
7043 fill = width - self->length;
7044
7045 u = pad(self, fill, 0, '0');
7046
Walter Dörwald068325e2002-04-15 13:36:47 +00007047 if (u == NULL)
7048 return NULL;
7049
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 if (u->str[fill] == '+' || u->str[fill] == '-') {
7051 /* move sign to beginning of string */
7052 u->str[0] = u->str[fill];
7053 u->str[fill] = '0';
7054 }
7055
7056 return (PyObject*) u;
7057}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007068"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007070Return True if S starts with the specified prefix, False otherwise.\n\
7071With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072With optional end, stop comparing S at that position.\n\
7073prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075static PyObject *
7076unicode_startswith(PyUnicodeObject *self,
7077 PyObject *args)
7078{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007081 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007082 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007083 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007086 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 if (PyTuple_Check(subobj)) {
7089 Py_ssize_t i;
7090 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7091 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7092 PyTuple_GET_ITEM(subobj, i));
7093 if (substring == NULL)
7094 return NULL;
7095 result = tailmatch(self, substring, start, end, -1);
7096 Py_DECREF(substring);
7097 if (result) {
7098 Py_RETURN_TRUE;
7099 }
7100 }
7101 /* nothing matched */
7102 Py_RETURN_FALSE;
7103 }
7104 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106 return NULL;
7107 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
7112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007113PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007114"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007116Return True if S ends with the specified suffix, False otherwise.\n\
7117With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118With optional end, stop comparing S at that position.\n\
7119suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject *
7122unicode_endswith(PyUnicodeObject *self,
7123 PyObject *args)
7124{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007127 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007128 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7132 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 if (PyTuple_Check(subobj)) {
7135 Py_ssize_t i;
7136 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7137 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7138 PyTuple_GET_ITEM(subobj, i));
7139 if (substring == NULL)
7140 return NULL;
7141 result = tailmatch(self, substring, start, end, +1);
7142 Py_DECREF(substring);
7143 if (result) {
7144 Py_RETURN_TRUE;
7145 }
7146 }
7147 Py_RETURN_FALSE;
7148 }
7149 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156}
7157
7158
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007159
7160static PyObject *
7161unicode_getnewargs(PyUnicodeObject *v)
7162{
7163 return Py_BuildValue("(u#)", v->str, v->length);
7164}
7165
7166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167static PyMethodDef unicode_methods[] = {
7168
7169 /* Order is according to common usage: often used methods should
7170 appear first, since lookup is done sequentially. */
7171
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007172 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7173 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7174 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007175 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007176 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7177 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7178 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7179 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7180 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7181 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7182 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007183 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007184 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7185 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7186 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007188 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007189/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7190 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7191 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7192 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007193 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007194 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007195 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007197 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7198 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7199 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7200 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7201 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7202 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7203 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7204 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7205 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7206 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7207 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7208 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7209 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7210 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007211 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007212#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007213 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214#endif
7215
7216#if 0
7217 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007218 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219#endif
7220
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007221 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 {NULL, NULL}
7223};
7224
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007225static PyObject *
7226unicode_mod(PyObject *v, PyObject *w)
7227{
7228 if (!PyUnicode_Check(v)) {
7229 Py_INCREF(Py_NotImplemented);
7230 return Py_NotImplemented;
7231 }
7232 return PyUnicode_Format(v, w);
7233}
7234
7235static PyNumberMethods unicode_as_number = {
7236 0, /*nb_add*/
7237 0, /*nb_subtract*/
7238 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007239 unicode_mod, /*nb_remainder*/
7240};
7241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007244 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7246 (ssizeargfunc) unicode_getitem, /* sq_item */
7247 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 0, /* sq_ass_item */
7249 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007250 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251};
7252
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007253static PyObject*
7254unicode_subscript(PyUnicodeObject* self, PyObject* item)
7255{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007256 if (PyIndex_Check(item)) {
7257 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007258 if (i == -1 && PyErr_Occurred())
7259 return NULL;
7260 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007261 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007262 return unicode_getitem(self, i);
7263 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007265 Py_UNICODE* source_buf;
7266 Py_UNICODE* result_buf;
7267 PyObject* result;
7268
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007269 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007270 &start, &stop, &step, &slicelength) < 0) {
7271 return NULL;
7272 }
7273
7274 if (slicelength <= 0) {
7275 return PyUnicode_FromUnicode(NULL, 0);
7276 } else {
7277 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007278 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7279 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007280
7281 if (result_buf == NULL)
7282 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007283
7284 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7285 result_buf[i] = source_buf[cur];
7286 }
Tim Petersced69f82003-09-16 20:30:58 +00007287
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007288 result = PyUnicode_FromUnicode(result_buf, slicelength);
7289 PyMem_FREE(result_buf);
7290 return result;
7291 }
7292 } else {
7293 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7294 return NULL;
7295 }
7296}
7297
7298static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007299 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007300 (binaryfunc)unicode_subscript, /* mp_subscript */
7301 (objobjargproc)0, /* mp_ass_subscript */
7302};
7303
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 const void **ptr)
7308{
7309 if (index != 0) {
7310 PyErr_SetString(PyExc_SystemError,
7311 "accessing non-existent unicode segment");
7312 return -1;
7313 }
7314 *ptr = (void *) self->str;
7315 return PyUnicode_GET_DATA_SIZE(self);
7316}
7317
Martin v. Löwis18e16552006-02-15 17:27:45 +00007318static Py_ssize_t
7319unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 const void **ptr)
7321{
7322 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007323 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 return -1;
7325}
7326
7327static int
7328unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007329 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
7331 if (lenp)
7332 *lenp = PyUnicode_GET_DATA_SIZE(self);
7333 return 1;
7334}
7335
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007336static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007338 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 const void **ptr)
7340{
7341 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007342
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 if (index != 0) {
7344 PyErr_SetString(PyExc_SystemError,
7345 "accessing non-existent unicode segment");
7346 return -1;
7347 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007348 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 if (str == NULL)
7350 return -1;
7351 *ptr = (void *) PyString_AS_STRING(str);
7352 return PyString_GET_SIZE(str);
7353}
7354
7355/* Helpers for PyUnicode_Format() */
7356
7357static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007358getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 if (argidx < arglen) {
7362 (*p_argidx)++;
7363 if (arglen < 0)
7364 return args;
7365 else
7366 return PyTuple_GetItem(args, argidx);
7367 }
7368 PyErr_SetString(PyExc_TypeError,
7369 "not enough arguments for format string");
7370 return NULL;
7371}
7372
/* Bit flags carried in the `flags` argument of the format helpers.
   Each occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08    /* selects the "#" form in formatfloat/formatint */
#define F_ZERO  0x10
7378
Martin v. Löwis18e16552006-02-15 17:27:45 +00007379static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007380strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007382 register Py_ssize_t i;
7383 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 for (i = len - 1; i >= 0; i--)
7385 buffer[i] = (Py_UNICODE) charbuffer[i];
7386
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 return len;
7388}
7389
Neal Norwitzfc76d632006-01-10 06:03:13 +00007390static int
7391doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7392{
Tim Peters15231542006-02-16 01:08:01 +00007393 Py_ssize_t result;
7394
Neal Norwitzfc76d632006-01-10 06:03:13 +00007395 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007396 result = strtounicode(buffer, (char *)buffer);
7397 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007398}
7399
7400static int
7401longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7402{
Tim Peters15231542006-02-16 01:08:01 +00007403 Py_ssize_t result;
7404
Neal Norwitzfc76d632006-01-10 06:03:13 +00007405 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007406 result = strtounicode(buffer, (char *)buffer);
7407 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007408}
7409
Guido van Rossum078151d2002-08-11 04:24:12 +00007410/* XXX To save some code duplication, formatfloat/long/int could have been
7411 shared with stringobject.c, converting from 8-bit to Unicode after the
7412 formatting is done. */
7413
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414static int
7415formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007416 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 int flags,
7418 int prec,
7419 int type,
7420 PyObject *v)
7421{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007422 /* fmt = '%#.' + `prec` + `type`
7423 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 char fmt[20];
7425 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007426
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 x = PyFloat_AsDouble(v);
7428 if (x == -1.0 && PyErr_Occurred())
7429 return -1;
7430 if (prec < 0)
7431 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7433 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007434 /* Worst case length calc to ensure no buffer overrun:
7435
7436 'g' formats:
7437 fmt = %#.<prec>g
7438 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7439 for any double rep.)
7440 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7441
7442 'f' formats:
7443 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7444 len = 1 + 50 + 1 + prec = 52 + prec
7445
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007446 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007447 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007448
7449 */
7450 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7451 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007452 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007453 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007454 return -1;
7455 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007456 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7457 (flags&F_ALT) ? "#" : "",
7458 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007459 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460}
7461
Tim Peters38fd5b62000-09-21 05:43:11 +00007462static PyObject*
7463formatlong(PyObject *val, int flags, int prec, int type)
7464{
7465 char *buf;
7466 int i, len;
7467 PyObject *str; /* temporary string object. */
7468 PyUnicodeObject *result;
7469
7470 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7471 if (!str)
7472 return NULL;
7473 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007474 if (!result) {
7475 Py_DECREF(str);
7476 return NULL;
7477 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007478 for (i = 0; i < len; i++)
7479 result->str[i] = buf[i];
7480 result->str[len] = 0;
7481 Py_DECREF(str);
7482 return (PyObject*)result;
7483}
7484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485static int
7486formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007487 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 int flags,
7489 int prec,
7490 int type,
7491 PyObject *v)
7492{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007493 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007494 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7495 * + 1 + 1
7496 * = 24
7497 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007498 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007499 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 long x;
7501
7502 x = PyInt_AsLong(v);
7503 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007504 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007505 if (x < 0 && type == 'u') {
7506 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007507 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007508 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7509 sign = "-";
7510 else
7511 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007513 prec = 1;
7514
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007515 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7516 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007517 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007518 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007519 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007520 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007521 return -1;
7522 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007523
7524 if ((flags & F_ALT) &&
7525 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007526 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007527 * of issues that cause pain:
7528 * - when 0 is being converted, the C standard leaves off
7529 * the '0x' or '0X', which is inconsistent with other
7530 * %#x/%#X conversions and inconsistent with Python's
7531 * hex() function
7532 * - there are platforms that violate the standard and
7533 * convert 0 with the '0x' or '0X'
7534 * (Metrowerks, Compaq Tru64)
7535 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007536 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007537 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007538 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007539 * We can achieve the desired consistency by inserting our
7540 * own '0x' or '0X' prefix, and substituting %x/%X in place
7541 * of %#x/%#X.
7542 *
7543 * Note that this is the same approach as used in
7544 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007545 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007546 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7547 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007548 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007549 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007550 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7551 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007552 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007553 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007554 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007555 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007556 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007557 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558}
7559
7560static int
7561formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007562 size_t buflen,
7563 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007565 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007566 if (PyUnicode_Check(v)) {
7567 if (PyUnicode_GET_SIZE(v) != 1)
7568 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007572 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007573 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007574 goto onError;
7575 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578 else {
7579 /* Integer input truncated to a character */
7580 long x;
7581 x = PyInt_AsLong(v);
7582 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007583 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007584#ifdef Py_UNICODE_WIDE
7585 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007586 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007587 "%c arg not in range(0x110000) "
7588 "(wide Python build)");
7589 return -1;
7590 }
7591#else
7592 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007593 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007594 "%c arg not in range(0x10000) "
7595 "(narrow Python build)");
7596 return -1;
7597 }
7598#endif
7599 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 }
7601 buf[1] = '\0';
7602 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007603
7604 onError:
7605 PyErr_SetString(PyExc_TypeError,
7606 "%c requires int or char");
7607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608}
7609
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
7619
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620PyObject *PyUnicode_Format(PyObject *format,
7621 PyObject *args)
7622{
7623 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007624 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 int args_owned = 0;
7626 PyUnicodeObject *result = NULL;
7627 PyObject *dict = NULL;
7628 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007629
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 if (format == NULL || args == NULL) {
7631 PyErr_BadInternalCall();
7632 return NULL;
7633 }
7634 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007635 if (uformat == NULL)
7636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 fmt = PyUnicode_AS_UNICODE(uformat);
7638 fmtcnt = PyUnicode_GET_SIZE(uformat);
7639
7640 reslen = rescnt = fmtcnt + 100;
7641 result = _PyUnicode_New(reslen);
7642 if (result == NULL)
7643 goto onError;
7644 res = PyUnicode_AS_UNICODE(result);
7645
7646 if (PyTuple_Check(args)) {
7647 arglen = PyTuple_Size(args);
7648 argidx = 0;
7649 }
7650 else {
7651 arglen = -1;
7652 argidx = -2;
7653 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007654 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7655 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 dict = args;
7657
7658 while (--fmtcnt >= 0) {
7659 if (*fmt != '%') {
7660 if (--rescnt < 0) {
7661 rescnt = fmtcnt + 100;
7662 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007663 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7666 --rescnt;
7667 }
7668 *res++ = *fmt++;
7669 }
7670 else {
7671 /* Got a format specifier */
7672 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 Py_UNICODE c = '\0';
7676 Py_UNICODE fill;
7677 PyObject *v = NULL;
7678 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007679 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007681 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007682 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
7684 fmt++;
7685 if (*fmt == '(') {
7686 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007687 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 PyObject *key;
7689 int pcount = 1;
7690
7691 if (dict == NULL) {
7692 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007693 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 goto onError;
7695 }
7696 ++fmt;
7697 --fmtcnt;
7698 keystart = fmt;
7699 /* Skip over balanced parentheses */
7700 while (pcount > 0 && --fmtcnt >= 0) {
7701 if (*fmt == ')')
7702 --pcount;
7703 else if (*fmt == '(')
7704 ++pcount;
7705 fmt++;
7706 }
7707 keylen = fmt - keystart - 1;
7708 if (fmtcnt < 0 || pcount > 0) {
7709 PyErr_SetString(PyExc_ValueError,
7710 "incomplete format key");
7711 goto onError;
7712 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007713#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007714 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 then looked up since Python uses strings to hold
7716 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007717 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 key = PyUnicode_EncodeUTF8(keystart,
7719 keylen,
7720 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007721#else
7722 key = PyUnicode_FromUnicode(keystart, keylen);
7723#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 if (key == NULL)
7725 goto onError;
7726 if (args_owned) {
7727 Py_DECREF(args);
7728 args_owned = 0;
7729 }
7730 args = PyObject_GetItem(dict, key);
7731 Py_DECREF(key);
7732 if (args == NULL) {
7733 goto onError;
7734 }
7735 args_owned = 1;
7736 arglen = -1;
7737 argidx = -2;
7738 }
7739 while (--fmtcnt >= 0) {
7740 switch (c = *fmt++) {
7741 case '-': flags |= F_LJUST; continue;
7742 case '+': flags |= F_SIGN; continue;
7743 case ' ': flags |= F_BLANK; continue;
7744 case '#': flags |= F_ALT; continue;
7745 case '0': flags |= F_ZERO; continue;
7746 }
7747 break;
7748 }
7749 if (c == '*') {
7750 v = getnextarg(args, arglen, &argidx);
7751 if (v == NULL)
7752 goto onError;
7753 if (!PyInt_Check(v)) {
7754 PyErr_SetString(PyExc_TypeError,
7755 "* wants int");
7756 goto onError;
7757 }
7758 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007759 if (width == -1 && PyErr_Occurred())
7760 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 if (width < 0) {
7762 flags |= F_LJUST;
7763 width = -width;
7764 }
7765 if (--fmtcnt >= 0)
7766 c = *fmt++;
7767 }
7768 else if (c >= '0' && c <= '9') {
7769 width = c - '0';
7770 while (--fmtcnt >= 0) {
7771 c = *fmt++;
7772 if (c < '0' || c > '9')
7773 break;
7774 if ((width*10) / 10 != width) {
7775 PyErr_SetString(PyExc_ValueError,
7776 "width too big");
7777 goto onError;
7778 }
7779 width = width*10 + (c - '0');
7780 }
7781 }
7782 if (c == '.') {
7783 prec = 0;
7784 if (--fmtcnt >= 0)
7785 c = *fmt++;
7786 if (c == '*') {
7787 v = getnextarg(args, arglen, &argidx);
7788 if (v == NULL)
7789 goto onError;
7790 if (!PyInt_Check(v)) {
7791 PyErr_SetString(PyExc_TypeError,
7792 "* wants int");
7793 goto onError;
7794 }
7795 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007796 if (prec == -1 && PyErr_Occurred())
7797 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 if (prec < 0)
7799 prec = 0;
7800 if (--fmtcnt >= 0)
7801 c = *fmt++;
7802 }
7803 else if (c >= '0' && c <= '9') {
7804 prec = c - '0';
7805 while (--fmtcnt >= 0) {
7806 c = Py_CHARMASK(*fmt++);
7807 if (c < '0' || c > '9')
7808 break;
7809 if ((prec*10) / 10 != prec) {
7810 PyErr_SetString(PyExc_ValueError,
7811 "prec too big");
7812 goto onError;
7813 }
7814 prec = prec*10 + (c - '0');
7815 }
7816 }
7817 } /* prec */
7818 if (fmtcnt >= 0) {
7819 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 if (--fmtcnt >= 0)
7821 c = *fmt++;
7822 }
7823 }
7824 if (fmtcnt < 0) {
7825 PyErr_SetString(PyExc_ValueError,
7826 "incomplete format");
7827 goto onError;
7828 }
7829 if (c != '%') {
7830 v = getnextarg(args, arglen, &argidx);
7831 if (v == NULL)
7832 goto onError;
7833 }
7834 sign = 0;
7835 fill = ' ';
7836 switch (c) {
7837
7838 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007839 pbuf = formatbuf;
7840 /* presume that buffer length is at least 1 */
7841 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 len = 1;
7843 break;
7844
7845 case 's':
7846 case 'r':
7847 if (PyUnicode_Check(v) && c == 's') {
7848 temp = v;
7849 Py_INCREF(temp);
7850 }
7851 else {
7852 PyObject *unicode;
7853 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007854 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 else
7856 temp = PyObject_Repr(v);
7857 if (temp == NULL)
7858 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007859 if (PyUnicode_Check(temp))
7860 /* nothing to do */;
7861 else if (PyString_Check(temp)) {
7862 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007863 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007865 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007867 Py_DECREF(temp);
7868 temp = unicode;
7869 if (temp == NULL)
7870 goto onError;
7871 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007872 else {
7873 Py_DECREF(temp);
7874 PyErr_SetString(PyExc_TypeError,
7875 "%s argument has non-string str()");
7876 goto onError;
7877 }
7878 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007879 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 len = PyUnicode_GET_SIZE(temp);
7881 if (prec >= 0 && len > prec)
7882 len = prec;
7883 break;
7884
7885 case 'i':
7886 case 'd':
7887 case 'u':
7888 case 'o':
7889 case 'x':
7890 case 'X':
7891 if (c == 'i')
7892 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007893 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007894 temp = formatlong(v, flags, prec, c);
7895 if (!temp)
7896 goto onError;
7897 pbuf = PyUnicode_AS_UNICODE(temp);
7898 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007899 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007901 else {
7902 pbuf = formatbuf;
7903 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7904 flags, prec, c, v);
7905 if (len < 0)
7906 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007907 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007908 }
7909 if (flags & F_ZERO)
7910 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 break;
7912
7913 case 'e':
7914 case 'E':
7915 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007916 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 case 'g':
7918 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007919 if (c == 'F')
7920 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007921 pbuf = formatbuf;
7922 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7923 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 if (len < 0)
7925 goto onError;
7926 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007927 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 fill = '0';
7929 break;
7930
7931 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007932 pbuf = formatbuf;
7933 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934 if (len < 0)
7935 goto onError;
7936 break;
7937
7938 default:
7939 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007940 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007941 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007942 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007943 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007944 (Py_ssize_t)(fmt - 1 -
7945 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 goto onError;
7947 }
7948 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007949 if (*pbuf == '-' || *pbuf == '+') {
7950 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 len--;
7952 }
7953 else if (flags & F_SIGN)
7954 sign = '+';
7955 else if (flags & F_BLANK)
7956 sign = ' ';
7957 else
7958 sign = 0;
7959 }
7960 if (width < len)
7961 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007962 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 reslen -= rescnt;
7964 rescnt = width + fmtcnt + 100;
7965 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007966 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007967 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007968 PyErr_NoMemory();
7969 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007970 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007971 if (_PyUnicode_Resize(&result, reslen) < 0) {
7972 Py_XDECREF(temp);
7973 goto onError;
7974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 res = PyUnicode_AS_UNICODE(result)
7976 + reslen - rescnt;
7977 }
7978 if (sign) {
7979 if (fill != ' ')
7980 *res++ = sign;
7981 rescnt--;
7982 if (width > len)
7983 width--;
7984 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007985 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7986 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007987 assert(pbuf[1] == c);
7988 if (fill != ' ') {
7989 *res++ = *pbuf++;
7990 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007991 }
Tim Petersfff53252001-04-12 18:38:48 +00007992 rescnt -= 2;
7993 width -= 2;
7994 if (width < 0)
7995 width = 0;
7996 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 if (width > len && !(flags & F_LJUST)) {
7999 do {
8000 --rescnt;
8001 *res++ = fill;
8002 } while (--width > len);
8003 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008004 if (fill == ' ') {
8005 if (sign)
8006 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008007 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008008 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008009 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008010 *res++ = *pbuf++;
8011 *res++ = *pbuf++;
8012 }
8013 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008014 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 res += len;
8016 rescnt -= len;
8017 while (--width >= len) {
8018 --rescnt;
8019 *res++ = ' ';
8020 }
8021 if (dict && (argidx < arglen) && c != '%') {
8022 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008023 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008024 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 goto onError;
8026 }
8027 Py_XDECREF(temp);
8028 } /* '%' */
8029 } /* until end */
8030 if (argidx < arglen && !dict) {
8031 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008032 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 goto onError;
8034 }
8035
Thomas Woutersa96affe2006-03-12 00:29:36 +00008036 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 if (args_owned) {
8039 Py_DECREF(args);
8040 }
8041 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 return (PyObject *)result;
8043
8044 onError:
8045 Py_XDECREF(result);
8046 Py_DECREF(uformat);
8047 if (args_owned) {
8048 Py_DECREF(args);
8049 }
8050 return NULL;
8051}
8052
8053static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008054 (readbufferproc) unicode_buffer_getreadbuf,
8055 (writebufferproc) unicode_buffer_getwritebuf,
8056 (segcountproc) unicode_buffer_getsegcount,
8057 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058};
8059
Jeremy Hylton938ace62002-07-17 16:30:39 +00008060static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008061unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8062
Tim Peters6d6c1a32001-08-02 04:15:00 +00008063static PyObject *
8064unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8065{
8066 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008067 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008068 char *encoding = NULL;
8069 char *errors = NULL;
8070
Guido van Rossume023fe02001-08-30 03:12:59 +00008071 if (type != &PyUnicode_Type)
8072 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008073 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8074 kwlist, &x, &encoding, &errors))
8075 return NULL;
8076 if (x == NULL)
8077 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008078 if (encoding == NULL && errors == NULL)
8079 return PyObject_Unicode(x);
8080 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008081 return PyUnicode_FromEncodedObject(x, encoding, errors);
8082}
8083
Guido van Rossume023fe02001-08-30 03:12:59 +00008084static PyObject *
8085unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8086{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008087 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008088 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008089
8090 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8091 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8092 if (tmp == NULL)
8093 return NULL;
8094 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008095 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008096 if (pnew == NULL) {
8097 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008098 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008099 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008100 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8101 if (pnew->str == NULL) {
8102 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008103 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008104 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008105 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008106 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008107 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8108 pnew->length = n;
8109 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008110 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008111 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008112}
8113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008114PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008115"unicode(string [, encoding[, errors]]) -> object\n\
8116\n\
8117Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008118encoding defaults to the current default string encoding.\n\
8119errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008120
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008121static PyObject *unicode_iter(PyObject *seq);
8122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123PyTypeObject PyUnicode_Type = {
8124 PyObject_HEAD_INIT(&PyType_Type)
8125 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008126 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 sizeof(PyUnicodeObject), /* tp_size */
8128 0, /* tp_itemsize */
8129 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008130 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008132 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008134 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008135 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008136 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008138 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 (hashfunc) unicode_hash, /* tp_hash*/
8140 0, /* tp_call*/
8141 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008142 PyObject_GenericGetAttr, /* tp_getattro */
8143 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008145 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8146 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008147 unicode_doc, /* tp_doc */
8148 0, /* tp_traverse */
8149 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008150 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008151 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008152 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008153 0, /* tp_iternext */
8154 unicode_methods, /* tp_methods */
8155 0, /* tp_members */
8156 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008157 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008158 0, /* tp_dict */
8159 0, /* tp_descr_get */
8160 0, /* tp_descr_set */
8161 0, /* tp_dictoffset */
8162 0, /* tp_init */
8163 0, /* tp_alloc */
8164 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008165 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166};
8167
8168/* Initialize the Unicode implementation */
8169
Thomas Wouters78890102000-07-22 19:25:51 +00008170void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008172 int i;
8173
Thomas Wouters477c8d52006-05-27 19:21:47 +00008174 /* XXX - move this array to unicodectype.c ? */
8175 Py_UNICODE linebreak[] = {
8176 0x000A, /* LINE FEED */
8177 0x000D, /* CARRIAGE RETURN */
8178 0x001C, /* FILE SEPARATOR */
8179 0x001D, /* GROUP SEPARATOR */
8180 0x001E, /* RECORD SEPARATOR */
8181 0x0085, /* NEXT LINE */
8182 0x2028, /* LINE SEPARATOR */
8183 0x2029, /* PARAGRAPH SEPARATOR */
8184 };
8185
Fred Drakee4315f52000-05-09 19:53:39 +00008186 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008187 unicode_freelist = NULL;
8188 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008190 if (!unicode_empty)
8191 return;
8192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008193 for (i = 0; i < 256; i++)
8194 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008195 if (PyType_Ready(&PyUnicode_Type) < 0)
8196 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197
8198 /* initialize the linebreak bloom filter */
8199 bloom_linebreak = make_bloom_mask(
8200 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8201 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008202
8203 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204}
8205
8206/* Finalize the Unicode implementation */
8207
8208void
Thomas Wouters78890102000-07-22 19:25:51 +00008209_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008211 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008212 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008214 Py_XDECREF(unicode_empty);
8215 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008217 for (i = 0; i < 256; i++) {
8218 if (unicode_latin1[i]) {
8219 Py_DECREF(unicode_latin1[i]);
8220 unicode_latin1[i] = NULL;
8221 }
8222 }
8223
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008224 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 PyUnicodeObject *v = u;
8226 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008227 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008228 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008229 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008230 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008232 unicode_freelist = NULL;
8233 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008235
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008236
8237
8238/********************* Unicode Iterator **************************/
8239
8240typedef struct {
8241 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008242 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008243 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8244} unicodeiterobject;
8245
8246static void
8247unicodeiter_dealloc(unicodeiterobject *it)
8248{
8249 _PyObject_GC_UNTRACK(it);
8250 Py_XDECREF(it->it_seq);
8251 PyObject_GC_Del(it);
8252}
8253
8254static int
8255unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8256{
8257 Py_VISIT(it->it_seq);
8258 return 0;
8259}
8260
8261static PyObject *
8262unicodeiter_next(unicodeiterobject *it)
8263{
8264 PyUnicodeObject *seq;
8265 PyObject *item;
8266
8267 assert(it != NULL);
8268 seq = it->it_seq;
8269 if (seq == NULL)
8270 return NULL;
8271 assert(PyUnicode_Check(seq));
8272
8273 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008274 item = PyUnicode_FromUnicode(
8275 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008276 if (item != NULL)
8277 ++it->it_index;
8278 return item;
8279 }
8280
8281 Py_DECREF(seq);
8282 it->it_seq = NULL;
8283 return NULL;
8284}
8285
8286static PyObject *
8287unicodeiter_len(unicodeiterobject *it)
8288{
8289 Py_ssize_t len = 0;
8290 if (it->it_seq)
8291 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8292 return PyInt_FromSsize_t(len);
8293}
8294
8295PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8296
8297static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008298 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8299 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008300 {NULL, NULL} /* sentinel */
8301};
8302
8303PyTypeObject PyUnicodeIter_Type = {
8304 PyObject_HEAD_INIT(&PyType_Type)
8305 0, /* ob_size */
8306 "unicodeiterator", /* tp_name */
8307 sizeof(unicodeiterobject), /* tp_basicsize */
8308 0, /* tp_itemsize */
8309 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008310 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008311 0, /* tp_print */
8312 0, /* tp_getattr */
8313 0, /* tp_setattr */
8314 0, /* tp_compare */
8315 0, /* tp_repr */
8316 0, /* tp_as_number */
8317 0, /* tp_as_sequence */
8318 0, /* tp_as_mapping */
8319 0, /* tp_hash */
8320 0, /* tp_call */
8321 0, /* tp_str */
8322 PyObject_GenericGetAttr, /* tp_getattro */
8323 0, /* tp_setattro */
8324 0, /* tp_as_buffer */
8325 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8326 0, /* tp_doc */
8327 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8328 0, /* tp_clear */
8329 0, /* tp_richcompare */
8330 0, /* tp_weaklistoffset */
8331 PyObject_SelfIter, /* tp_iter */
8332 (iternextfunc)unicodeiter_next, /* tp_iternext */
8333 unicodeiter_methods, /* tp_methods */
8334 0,
8335};
8336
8337static PyObject *
8338unicode_iter(PyObject *seq)
8339{
8340 unicodeiterobject *it;
8341
8342 if (!PyUnicode_Check(seq)) {
8343 PyErr_BadInternalCall();
8344 return NULL;
8345 }
8346 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8347 if (it == NULL)
8348 return NULL;
8349 it->it_index = 0;
8350 Py_INCREF(seq);
8351 it->it_seq = (PyUnicodeObject *)seq;
8352 _PyObject_GC_TRACK(it);
8353 return (PyObject *)it;
8354}
8355
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008356#ifdef __cplusplus
8357}
8358#endif
8359
8360
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008361/*
8362Local variables:
8363c-basic-offset: 4
8364indent-tabs-mode: nil
8365End:
8366*/