blob: a0740db1073076a320cb08363a4f473744d134ab [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects that are put on the free list keep their character buffer
   allocated as long as their length is below this limit.  This saves
   malloc() calls for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple a single bitmask is used; the least significant 5 bits of
   each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.

   The shifted constant is an unsigned long: bit indexes run up to 31,
   and shifting a signed int 1 by 31 is undefined behavior.  The mask
   argument is parenthesized so that compound expressions such as
   (a | b) can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test with the bloom mask as a cheap pre-filter */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True if chr is contained in set; the bloom mask is consulted first as
   a cheap pre-filter.  The expansion is fully parenthesized so the macro
   can be negated or combined with other operators without changing its
   meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
351#define _PyUnicode_Resize(unicodevar, length) \
352 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
430 unicode = _PyUnicode_New(size);
431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL: true when c must be base64-encoded.  Anything outside 1..127
   is special; inside that range the table decides, with classes 2 and 3
   counted only when encodeWS / encodeO are true. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64: map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR: true when c is one of the 64 base64 alphabet characters.
   Explicit ASCII range tests are used instead of isalnum(): isalnum() is
   undefined for values that do not fit in unsigned char (c may be any
   Py_UNICODE) and its answer depends on the current locale.  c is
   evaluated several times, so pass a plain variable. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64: inverse of B64 for a character for which B64CHAR is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit one base64 character to out for every complete group of
   6 bits held in ch, decrementing bits accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit one Py_UNICODE to out for every complete group of 16 bits
   held in ch.  Relies on the expanding function providing the variable
   errmsg and the label utf7Error. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
1235#undef SPECIAL
1236#undef B64
1237#undef B64CHAR
1238#undef UB64
1239#undef ENCODE
1240#undef DECODE
1241
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Sequence length indexed by the first byte of a UTF-8 sequence.
   A zero entry marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in "" or '' quotes as
   appropriate.

*/
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097static
2098PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002099 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 int quotes)
2101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002105 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106
Thomas Wouters89f507f2006-12-13 04:49:30 +00002107 /* XXX(nnorwitz): rather than over-allocating, it would be
2108 better to choose a different scheme. Perhaps scan the
2109 first N-chars of the string and allocate based on that size.
2110 */
2111 /* Initial allocation is based on the longest-possible unichr
2112 escape.
2113
2114 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2115 unichr, so in this case it's the longest unichr escape. In
2116 narrow (UTF-16) builds this is five chars per source unichr
2117 since there are two unichrs in the surrogate pair, so in narrow
2118 (UTF-16) builds it's not the longest unichr escape.
2119
2120 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2121 so in the narrow (UTF-16) build case it's the longest unichr
2122 escape.
2123 */
2124
2125 repr = PyString_FromStringAndSize(NULL,
2126 2
2127#ifdef Py_UNICODE_WIDE
2128 + 10*size
2129#else
2130 + 6*size
2131#endif
2132 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 if (repr == NULL)
2134 return NULL;
2135
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002136 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137
2138 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002139 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 !findchar(s, size, '"')) ? '"' : '\'';
2141 }
2142 while (size-- > 0) {
2143 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002145 /* Escape quotes and backslashes */
2146 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002147 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 *p++ = '\\';
2149 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002150 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002153#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002154 /* Map 21-bit characters to '\U00xxxxxx' */
2155 else if (ch >= 0x10000) {
2156 *p++ = '\\';
2157 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002158 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2159 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2160 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2161 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2162 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2163 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2164 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002165 *p++ = hexdigit[ch & 0x0000000F];
2166 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002167 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002168#else
2169 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002170 else if (ch >= 0xD800 && ch < 0xDC00) {
2171 Py_UNICODE ch2;
2172 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002173
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002174 ch2 = *s++;
2175 size--;
2176 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2177 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2178 *p++ = '\\';
2179 *p++ = 'U';
2180 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2181 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2182 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2183 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2184 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2185 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2186 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2187 *p++ = hexdigit[ucs & 0x0000000F];
2188 continue;
2189 }
2190 /* Fall through: isolated surrogates are copied as-is */
2191 s--;
2192 size++;
2193 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002194#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002195
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002197 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 *p++ = '\\';
2199 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002200 *p++ = hexdigit[(ch >> 12) & 0x000F];
2201 *p++ = hexdigit[(ch >> 8) & 0x000F];
2202 *p++ = hexdigit[(ch >> 4) & 0x000F];
2203 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002205
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002206 /* Map special whitespace to '\t', \n', '\r' */
2207 else if (ch == '\t') {
2208 *p++ = '\\';
2209 *p++ = 't';
2210 }
2211 else if (ch == '\n') {
2212 *p++ = '\\';
2213 *p++ = 'n';
2214 }
2215 else if (ch == '\r') {
2216 *p++ = '\\';
2217 *p++ = 'r';
2218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002220 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002221 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002223 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002224 *p++ = hexdigit[(ch >> 4) & 0x000F];
2225 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002226 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002227
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 /* Copy everything else as-is */
2229 else
2230 *p++ = (char) ch;
2231 }
2232 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002233 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234
2235 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002236 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 return repr;
2238}
2239
2240PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002241 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242{
2243 return unicodeescape_string(s, size, 0);
2244}
2245
2246PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2247{
2248 if (!PyUnicode_Check(unicode)) {
2249 PyErr_BadArgument();
2250 return NULL;
2251 }
2252 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2253 PyUnicode_GET_SIZE(unicode));
2254}
2255
2256/* --- Raw Unicode Escape Codec ------------------------------------------- */
2257
2258PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 const char *errors)
2261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002263 Py_ssize_t startinpos;
2264 Py_ssize_t endinpos;
2265 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 const char *end;
2269 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 PyObject *errorHandler = NULL;
2271 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002272
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 /* Escaped strings will always be longer than the resulting
2274 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 length after conversion to the true value. (But decoding error
2276 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 v = _PyUnicode_New(size);
2278 if (v == NULL)
2279 goto onError;
2280 if (size == 0)
2281 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 end = s + size;
2284 while (s < end) {
2285 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002286 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002288 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289
2290 /* Non-escape characters are interpreted as Unicode ordinals */
2291 if (*s != '\\') {
2292 *p++ = (unsigned char)*s++;
2293 continue;
2294 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296
2297 /* \u-escapes are only interpreted iff the number of leading
2298 backslashes if odd */
2299 bs = s;
2300 for (;s < end;) {
2301 if (*s != '\\')
2302 break;
2303 *p++ = (unsigned char)*s++;
2304 }
2305 if (((s - bs) & 1) == 0 ||
2306 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002307 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 continue;
2309 }
2310 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 s++;
2313
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002314 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002315 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002316 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002317 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 endinpos = s-starts;
2320 if (unicode_decode_call_errorhandler(
2321 errors, &errorHandler,
2322 "rawunicodeescape", "truncated \\uXXXX",
2323 starts, size, &startinpos, &endinpos, &exc, &s,
2324 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002326 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 }
2328 x = (x<<4) & ~0xF;
2329 if (c >= '0' && c <= '9')
2330 x += c - '0';
2331 else if (c >= 'a' && c <= 'f')
2332 x += 10 + c - 'a';
2333 else
2334 x += 10 + c - 'A';
2335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002336#ifndef Py_UNICODE_WIDE
2337 if (x > 0x10000) {
2338 if (unicode_decode_call_errorhandler(
2339 errors, &errorHandler,
2340 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2341 starts, size, &startinpos, &endinpos, &exc, &s,
2342 (PyObject **)&v, &outpos, &p))
2343 goto onError;
2344 }
2345#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 *p++ = x;
2347 nextByte:
2348 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002351 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002355
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 onError:
2357 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 return NULL;
2361}
2362
2363PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365{
2366 PyObject *repr;
2367 char *p;
2368 char *q;
2369
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002370 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002372#ifdef Py_UNICODE_WIDE
2373 repr = PyString_FromStringAndSize(NULL, 10 * size);
2374#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002376#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 if (repr == NULL)
2378 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002379 if (size == 0)
2380 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 p = q = PyString_AS_STRING(repr);
2383 while (size-- > 0) {
2384 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002385#ifdef Py_UNICODE_WIDE
2386 /* Map 32-bit characters to '\Uxxxxxxxx' */
2387 if (ch >= 0x10000) {
2388 *p++ = '\\';
2389 *p++ = 'U';
2390 *p++ = hexdigit[(ch >> 28) & 0xf];
2391 *p++ = hexdigit[(ch >> 24) & 0xf];
2392 *p++ = hexdigit[(ch >> 20) & 0xf];
2393 *p++ = hexdigit[(ch >> 16) & 0xf];
2394 *p++ = hexdigit[(ch >> 12) & 0xf];
2395 *p++ = hexdigit[(ch >> 8) & 0xf];
2396 *p++ = hexdigit[(ch >> 4) & 0xf];
2397 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002398 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002399 else
2400#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 /* Map 16-bit characters to '\uxxxx' */
2402 if (ch >= 256) {
2403 *p++ = '\\';
2404 *p++ = 'u';
2405 *p++ = hexdigit[(ch >> 12) & 0xf];
2406 *p++ = hexdigit[(ch >> 8) & 0xf];
2407 *p++ = hexdigit[(ch >> 4) & 0xf];
2408 *p++ = hexdigit[ch & 15];
2409 }
2410 /* Copy everything else as-is */
2411 else
2412 *p++ = (char) ch;
2413 }
2414 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002415 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416 return repr;
2417}
2418
2419PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2420{
2421 if (!PyUnicode_Check(unicode)) {
2422 PyErr_BadArgument();
2423 return NULL;
2424 }
2425 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2426 PyUnicode_GET_SIZE(unicode));
2427}
2428
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002429/* --- Unicode Internal Codec ------------------------------------------- */
2430
2431PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002432 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002433 const char *errors)
2434{
2435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t startinpos;
2437 Py_ssize_t endinpos;
2438 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002439 PyUnicodeObject *v;
2440 Py_UNICODE *p;
2441 const char *end;
2442 const char *reason;
2443 PyObject *errorHandler = NULL;
2444 PyObject *exc = NULL;
2445
Neal Norwitzd43069c2006-01-08 01:12:10 +00002446#ifdef Py_UNICODE_WIDE
2447 Py_UNICODE unimax = PyUnicode_GetMax();
2448#endif
2449
Thomas Wouters89f507f2006-12-13 04:49:30 +00002450 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002451 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2452 if (v == NULL)
2453 goto onError;
2454 if (PyUnicode_GetSize((PyObject *)v) == 0)
2455 return (PyObject *)v;
2456 p = PyUnicode_AS_UNICODE(v);
2457 end = s + size;
2458
2459 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002460 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002461 /* We have to sanity check the raw data, otherwise doom looms for
2462 some malformed UCS-4 data. */
2463 if (
2464 #ifdef Py_UNICODE_WIDE
2465 *p > unimax || *p < 0 ||
2466 #endif
2467 end-s < Py_UNICODE_SIZE
2468 )
2469 {
2470 startinpos = s - starts;
2471 if (end-s < Py_UNICODE_SIZE) {
2472 endinpos = end-starts;
2473 reason = "truncated input";
2474 }
2475 else {
2476 endinpos = s - starts + Py_UNICODE_SIZE;
2477 reason = "illegal code point (> 0x10FFFF)";
2478 }
2479 outpos = p - PyUnicode_AS_UNICODE(v);
2480 if (unicode_decode_call_errorhandler(
2481 errors, &errorHandler,
2482 "unicode_internal", reason,
2483 starts, size, &startinpos, &endinpos, &exc, &s,
2484 (PyObject **)&v, &outpos, &p)) {
2485 goto onError;
2486 }
2487 }
2488 else {
2489 p++;
2490 s += Py_UNICODE_SIZE;
2491 }
2492 }
2493
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002494 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002495 goto onError;
2496 Py_XDECREF(errorHandler);
2497 Py_XDECREF(exc);
2498 return (PyObject *)v;
2499
2500 onError:
2501 Py_XDECREF(v);
2502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
2504 return NULL;
2505}
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507/* --- Latin-1 Codec ------------------------------------------------------ */
2508
2509PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002510 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 const char *errors)
2512{
2513 PyUnicodeObject *v;
2514 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002515
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002517 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002518 Py_UNICODE r = *(unsigned char*)s;
2519 return PyUnicode_FromUnicode(&r, 1);
2520 }
2521
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 v = _PyUnicode_New(size);
2523 if (v == NULL)
2524 goto onError;
2525 if (size == 0)
2526 return (PyObject *)v;
2527 p = PyUnicode_AS_UNICODE(v);
2528 while (size-- > 0)
2529 *p++ = (unsigned char)*s++;
2530 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 onError:
2533 Py_XDECREF(v);
2534 return NULL;
2535}
2536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537/* create or adjust a UnicodeEncodeError */
2538static void make_encode_exception(PyObject **exceptionObject,
2539 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002540 const Py_UNICODE *unicode, Py_ssize_t size,
2541 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (*exceptionObject == NULL) {
2545 *exceptionObject = PyUnicodeEncodeError_Create(
2546 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 }
2548 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2550 goto onError;
2551 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2552 goto onError;
2553 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2554 goto onError;
2555 return;
2556 onError:
2557 Py_DECREF(*exceptionObject);
2558 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 }
2560}
2561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562/* raises a UnicodeEncodeError */
2563static void raise_encode_exception(PyObject **exceptionObject,
2564 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002565 const Py_UNICODE *unicode, Py_ssize_t size,
2566 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 const char *reason)
2568{
2569 make_encode_exception(exceptionObject,
2570 encoding, unicode, size, startpos, endpos, reason);
2571 if (*exceptionObject != NULL)
2572 PyCodec_StrictErrors(*exceptionObject);
2573}
2574
2575/* error handling callback helper:
2576 build arguments, call the callback and check the arguments,
2577 put the result into newpos and return the replacement string, which
2578 has to be freed by the caller */
2579static PyObject *unicode_encode_call_errorhandler(const char *errors,
2580 PyObject **errorHandler,
2581 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002582 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2583 Py_ssize_t startpos, Py_ssize_t endpos,
2584 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002585{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587
2588 PyObject *restuple;
2589 PyObject *resunicode;
2590
2591 if (*errorHandler == NULL) {
2592 *errorHandler = PyCodec_LookupError(errors);
2593 if (*errorHandler == NULL)
2594 return NULL;
2595 }
2596
2597 make_encode_exception(exceptionObject,
2598 encoding, unicode, size, startpos, endpos, reason);
2599 if (*exceptionObject == NULL)
2600 return NULL;
2601
2602 restuple = PyObject_CallFunctionObjArgs(
2603 *errorHandler, *exceptionObject, NULL);
2604 if (restuple == NULL)
2605 return NULL;
2606 if (!PyTuple_Check(restuple)) {
2607 PyErr_Format(PyExc_TypeError, &argparse[4]);
2608 Py_DECREF(restuple);
2609 return NULL;
2610 }
2611 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2612 &resunicode, newpos)) {
2613 Py_DECREF(restuple);
2614 return NULL;
2615 }
2616 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002617 *newpos = size+*newpos;
2618 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002619 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002620 Py_DECREF(restuple);
2621 return NULL;
2622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 Py_INCREF(resunicode);
2624 Py_DECREF(restuple);
2625 return resunicode;
2626}
2627
2628static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002629 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 const char *errors,
2631 int limit)
2632{
2633 /* output object */
2634 PyObject *res;
2635 /* pointers to the beginning and end+1 of input */
2636 const Py_UNICODE *startp = p;
2637 const Py_UNICODE *endp = p + size;
2638 /* pointer to the beginning of the unencodable characters */
2639 /* const Py_UNICODE *badp = NULL; */
2640 /* pointer into the output */
2641 char *str;
2642 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002643 Py_ssize_t respos = 0;
2644 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002645 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2646 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 PyObject *errorHandler = NULL;
2648 PyObject *exc = NULL;
2649 /* the following variable is used for caching string comparisons
2650 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2651 int known_errorHandler = -1;
2652
2653 /* allocate enough for a simple encoding without
2654 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002655 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 if (res == NULL)
2657 goto onError;
2658 if (size == 0)
2659 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002660 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 ressize = size;
2662
2663 while (p<endp) {
2664 Py_UNICODE c = *p;
2665
2666 /* can we encode this? */
2667 if (c<limit) {
2668 /* no overflow check, because we know that the space is enough */
2669 *str++ = (char)c;
2670 ++p;
2671 }
2672 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002673 Py_ssize_t unicodepos = p-startp;
2674 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002676 Py_ssize_t repsize;
2677 Py_ssize_t newpos;
2678 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 Py_UNICODE *uni2;
2680 /* startpos for collecting unencodable chars */
2681 const Py_UNICODE *collstart = p;
2682 const Py_UNICODE *collend = p;
2683 /* find all unecodable characters */
2684 while ((collend < endp) && ((*collend)>=limit))
2685 ++collend;
2686 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2687 if (known_errorHandler==-1) {
2688 if ((errors==NULL) || (!strcmp(errors, "strict")))
2689 known_errorHandler = 1;
2690 else if (!strcmp(errors, "replace"))
2691 known_errorHandler = 2;
2692 else if (!strcmp(errors, "ignore"))
2693 known_errorHandler = 3;
2694 else if (!strcmp(errors, "xmlcharrefreplace"))
2695 known_errorHandler = 4;
2696 else
2697 known_errorHandler = 0;
2698 }
2699 switch (known_errorHandler) {
2700 case 1: /* strict */
2701 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2702 goto onError;
2703 case 2: /* replace */
2704 while (collstart++<collend)
2705 *str++ = '?'; /* fall through */
2706 case 3: /* ignore */
2707 p = collend;
2708 break;
2709 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002710 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 /* determine replacement size (temporarily (mis)uses p) */
2712 for (p = collstart, repsize = 0; p < collend; ++p) {
2713 if (*p<10)
2714 repsize += 2+1+1;
2715 else if (*p<100)
2716 repsize += 2+2+1;
2717 else if (*p<1000)
2718 repsize += 2+3+1;
2719 else if (*p<10000)
2720 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002721#ifndef Py_UNICODE_WIDE
2722 else
2723 repsize += 2+5+1;
2724#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 else if (*p<100000)
2726 repsize += 2+5+1;
2727 else if (*p<1000000)
2728 repsize += 2+6+1;
2729 else
2730 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 }
2733 requiredsize = respos+repsize+(endp-collend);
2734 if (requiredsize > ressize) {
2735 if (requiredsize<2*ressize)
2736 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002737 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002739 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 ressize = requiredsize;
2741 }
2742 /* generate replacement (temporarily (mis)uses p) */
2743 for (p = collstart; p < collend; ++p) {
2744 str += sprintf(str, "&#%d;", (int)*p);
2745 }
2746 p = collend;
2747 break;
2748 default:
2749 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2750 encoding, reason, startp, size, &exc,
2751 collstart-startp, collend-startp, &newpos);
2752 if (repunicode == NULL)
2753 goto onError;
2754 /* need more space? (at least enough for what we
2755 have+the replacement+the rest of the string, so
2756 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002757 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 repsize = PyUnicode_GET_SIZE(repunicode);
2759 requiredsize = respos+repsize+(endp-collend);
2760 if (requiredsize > ressize) {
2761 if (requiredsize<2*ressize)
2762 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002763 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 Py_DECREF(repunicode);
2765 goto onError;
2766 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002767 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 ressize = requiredsize;
2769 }
2770 /* check if there is anything unencodable in the replacement
2771 and copy it to the output */
2772 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2773 c = *uni2;
2774 if (c >= limit) {
2775 raise_encode_exception(&exc, encoding, startp, size,
2776 unicodepos, unicodepos+1, reason);
2777 Py_DECREF(repunicode);
2778 goto onError;
2779 }
2780 *str = (char)c;
2781 }
2782 p = startp + newpos;
2783 Py_DECREF(repunicode);
2784 }
2785 }
2786 }
2787 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002788 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002789 if (respos<ressize)
2790 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002791 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 Py_XDECREF(errorHandler);
2793 Py_XDECREF(exc);
2794 return res;
2795
2796 onError:
2797 Py_XDECREF(res);
2798 Py_XDECREF(errorHandler);
2799 Py_XDECREF(exc);
2800 return NULL;
2801}
2802
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002804 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 const char *errors)
2806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808}
2809
2810PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2811{
2812 if (!PyUnicode_Check(unicode)) {
2813 PyErr_BadArgument();
2814 return NULL;
2815 }
2816 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2817 PyUnicode_GET_SIZE(unicode),
2818 NULL);
2819}
2820
2821/* --- 7-bit ASCII Codec -------------------------------------------------- */
2822
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 const char *errors)
2826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 PyUnicodeObject *v;
2829 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002830 Py_ssize_t startinpos;
2831 Py_ssize_t endinpos;
2832 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 const char *e;
2834 PyObject *errorHandler = NULL;
2835 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002838 if (size == 1 && *(unsigned char*)s < 128) {
2839 Py_UNICODE r = *(unsigned char*)s;
2840 return PyUnicode_FromUnicode(&r, 1);
2841 }
Tim Petersced69f82003-09-16 20:30:58 +00002842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 v = _PyUnicode_New(size);
2844 if (v == NULL)
2845 goto onError;
2846 if (size == 0)
2847 return (PyObject *)v;
2848 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 e = s + size;
2850 while (s < e) {
2851 register unsigned char c = (unsigned char)*s;
2852 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 ++s;
2855 }
2856 else {
2857 startinpos = s-starts;
2858 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 if (unicode_decode_call_errorhandler(
2861 errors, &errorHandler,
2862 "ascii", "ordinal not in range(128)",
2863 starts, size, &startinpos, &endinpos, &exc, &s,
2864 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002870 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 Py_XDECREF(errorHandler);
2872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002874
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 onError:
2876 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002877 Py_XDECREF(errorHandler);
2878 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 return NULL;
2880}
2881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002883 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 const char *errors)
2885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002886 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887}
2888
2889PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2890{
2891 if (!PyUnicode_Check(unicode)) {
2892 PyErr_BadArgument();
2893 return NULL;
2894 }
2895 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2896 PyUnicode_GET_SIZE(unicode),
2897 NULL);
2898}
2899
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002900#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002902/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002903
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002904#if SIZEOF_INT < SIZEOF_SSIZE_T
2905#define NEED_RETRY
2906#endif
2907
2908/* XXX This code is limited to "true" double-byte encodings, as
2909 a) it assumes an incomplete character consists of a single byte, and
2910 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2911 encodings, see IsDBCSLeadByteEx documentation. */
2912
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise
   the previous character position is obtained with CharPrev() and the
   byte counts as a lead byte when there is no previous character, when
   the previous character does not start with a lead byte, or when the
   previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2923
2924/*
2925 * Decode MBCS string into unicode object. If 'final' is set, converts
2926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2927 */
2928static int decode_mbcs(PyUnicodeObject **v,
2929 const char *s, /* MBCS string */
2930 int size, /* sizeof MBCS string */
2931 int final)
2932{
2933 Py_UNICODE *p;
2934 Py_ssize_t n = 0;
2935 int usize = 0;
2936
2937 assert(size >= 0);
2938
2939 /* Skip trailing lead-byte unless 'final' is set */
2940 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2941 --size;
2942
2943 /* First get the size of the result */
2944 if (size > 0) {
2945 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2946 if (usize == 0) {
2947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2948 return -1;
2949 }
2950 }
2951
2952 if (*v == NULL) {
2953 /* Create unicode object */
2954 *v = _PyUnicode_New(usize);
2955 if (*v == NULL)
2956 return -1;
2957 }
2958 else {
2959 /* Extend unicode object */
2960 n = PyUnicode_GET_SIZE(*v);
2961 if (_PyUnicode_Resize(v, n + usize) < 0)
2962 return -1;
2963 }
2964
2965 /* Do the conversion */
2966 if (size > 0) {
2967 p = PyUnicode_AS_UNICODE(*v) + n;
2968 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2969 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2970 return -1;
2971 }
2972 }
2973
2974 return size;
2975}
2976
2977PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2978 Py_ssize_t size,
2979 const char *errors,
2980 Py_ssize_t *consumed)
2981{
2982 PyUnicodeObject *v = NULL;
2983 int done;
2984
2985 if (consumed)
2986 *consumed = 0;
2987
2988#ifdef NEED_RETRY
2989 retry:
2990 if (size > INT_MAX)
2991 done = decode_mbcs(&v, s, INT_MAX, 0);
2992 else
2993#endif
2994 done = decode_mbcs(&v, s, (int)size, !consumed);
2995
2996 if (done < 0) {
2997 Py_XDECREF(v);
2998 return NULL;
2999 }
3000
3001 if (consumed)
3002 *consumed += done;
3003
3004#ifdef NEED_RETRY
3005 if (size > INT_MAX) {
3006 s += done;
3007 size -= done;
3008 goto retry;
3009 }
3010#endif
3011
3012 return (PyObject *)v;
3013}
3014
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003015PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003017 const char *errors)
3018{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003019 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3020}
3021
3022/*
3023 * Convert unicode into string object (MBCS).
3024 * Returns 0 if succeed, -1 otherwise.
3025 */
3026static int encode_mbcs(PyObject **repr,
3027 const Py_UNICODE *p, /* unicode */
3028 int size) /* size of unicode */
3029{
3030 int mbcssize = 0;
3031 Py_ssize_t n = 0;
3032
3033 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003034
3035 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003036 if (size > 0) {
3037 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3038 if (mbcssize == 0) {
3039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3040 return -1;
3041 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003042 }
3043
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003044 if (*repr == NULL) {
3045 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003046 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003047 if (*repr == NULL)
3048 return -1;
3049 }
3050 else {
3051 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003052 n = PyBytes_Size(*repr);
3053 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003054 return -1;
3055 }
3056
3057 /* Do the conversion */
3058 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003059 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003060 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3061 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3062 return -1;
3063 }
3064 }
3065
3066 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003067}
3068
3069PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003070 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003071 const char *errors)
3072{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003073 PyObject *repr = NULL;
3074 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003075
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003076#ifdef NEED_RETRY
3077 retry:
3078 if (size > INT_MAX)
3079 ret = encode_mbcs(&repr, p, INT_MAX);
3080 else
3081#endif
3082 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003083
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003084 if (ret < 0) {
3085 Py_XDECREF(repr);
3086 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003087 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003088
3089#ifdef NEED_RETRY
3090 if (size > INT_MAX) {
3091 p += INT_MAX;
3092 size -= INT_MAX;
3093 goto retry;
3094 }
3095#endif
3096
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003097 return repr;
3098}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003099
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003100PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3101{
3102 if (!PyUnicode_Check(unicode)) {
3103 PyErr_BadArgument();
3104 return NULL;
3105 }
3106 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3107 PyUnicode_GET_SIZE(unicode),
3108 NULL);
3109}
3110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003111#undef NEED_RETRY
3112
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003113#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115/* --- Character Mapping Codec -------------------------------------------- */
3116
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 PyObject *mapping,
3120 const char *errors)
3121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t startinpos;
3124 Py_ssize_t endinpos;
3125 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 PyUnicodeObject *v;
3128 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003129 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 PyObject *errorHandler = NULL;
3131 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003132 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003134
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 /* Default to Latin-1 */
3136 if (mapping == NULL)
3137 return PyUnicode_DecodeLatin1(s, size, errors);
3138
3139 v = _PyUnicode_New(size);
3140 if (v == NULL)
3141 goto onError;
3142 if (size == 0)
3143 return (PyObject *)v;
3144 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003145 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003146 if (PyUnicode_CheckExact(mapping)) {
3147 mapstring = PyUnicode_AS_UNICODE(mapping);
3148 maplen = PyUnicode_GET_SIZE(mapping);
3149 while (s < e) {
3150 unsigned char ch = *s;
3151 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003153 if (ch < maplen)
3154 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156 if (x == 0xfffe) {
3157 /* undefined mapping */
3158 outpos = p-PyUnicode_AS_UNICODE(v);
3159 startinpos = s-starts;
3160 endinpos = startinpos+1;
3161 if (unicode_decode_call_errorhandler(
3162 errors, &errorHandler,
3163 "charmap", "character maps to <undefined>",
3164 starts, size, &startinpos, &endinpos, &exc, &s,
3165 (PyObject **)&v, &outpos, &p)) {
3166 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003167 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003168 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003169 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003170 *p++ = x;
3171 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173 }
3174 else {
3175 while (s < e) {
3176 unsigned char ch = *s;
3177 PyObject *w, *x;
3178
3179 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3180 w = PyInt_FromLong((long)ch);
3181 if (w == NULL)
3182 goto onError;
3183 x = PyObject_GetItem(mapping, w);
3184 Py_DECREF(w);
3185 if (x == NULL) {
3186 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3187 /* No mapping found means: mapping is undefined. */
3188 PyErr_Clear();
3189 x = Py_None;
3190 Py_INCREF(x);
3191 } else
3192 goto onError;
3193 }
3194
3195 /* Apply mapping */
3196 if (PyInt_Check(x)) {
3197 long value = PyInt_AS_LONG(x);
3198 if (value < 0 || value > 65535) {
3199 PyErr_SetString(PyExc_TypeError,
3200 "character mapping must be in range(65536)");
3201 Py_DECREF(x);
3202 goto onError;
3203 }
3204 *p++ = (Py_UNICODE)value;
3205 }
3206 else if (x == Py_None) {
3207 /* undefined mapping */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 startinpos = s-starts;
3210 endinpos = startinpos+1;
3211 if (unicode_decode_call_errorhandler(
3212 errors, &errorHandler,
3213 "charmap", "character maps to <undefined>",
3214 starts, size, &startinpos, &endinpos, &exc, &s,
3215 (PyObject **)&v, &outpos, &p)) {
3216 Py_DECREF(x);
3217 goto onError;
3218 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003219 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003220 continue;
3221 }
3222 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003223 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003224
3225 if (targetsize == 1)
3226 /* 1-1 mapping */
3227 *p++ = *PyUnicode_AS_UNICODE(x);
3228
3229 else if (targetsize > 1) {
3230 /* 1-n mapping */
3231 if (targetsize > extrachars) {
3232 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3234 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003235 (targetsize << 2);
3236 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003237 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003238 if (_PyUnicode_Resize(&v,
3239 PyUnicode_GET_SIZE(v) + needed) < 0) {
3240 Py_DECREF(x);
3241 goto onError;
3242 }
3243 p = PyUnicode_AS_UNICODE(v) + oldpos;
3244 }
3245 Py_UNICODE_COPY(p,
3246 PyUnicode_AS_UNICODE(x),
3247 targetsize);
3248 p += targetsize;
3249 extrachars -= targetsize;
3250 }
3251 /* 1-0 mapping: skip the character */
3252 }
3253 else {
3254 /* wrong return value */
3255 PyErr_SetString(PyExc_TypeError,
3256 "character mapping must return integer, None or unicode");
3257 Py_DECREF(x);
3258 goto onError;
3259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003261 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 }
3264 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 Py_XDECREF(v);
3275 return NULL;
3276}
3277
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003278/* Charmap encoding: the lookup table */
3279
3280struct encoding_map{
3281 PyObject_HEAD
3282 unsigned char level1[32];
3283 int count2, count3;
3284 unsigned char level23[1];
3285};
3286
3287static PyObject*
3288encoding_map_size(PyObject *obj, PyObject* args)
3289{
3290 struct encoding_map *map = (struct encoding_map*)obj;
3291 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3292 128*map->count3);
3293}
3294
3295static PyMethodDef encoding_map_methods[] = {
3296 {"size", encoding_map_size, METH_NOARGS,
3297 PyDoc_STR("Return the size (in bytes) of this object") },
3298 { 0 }
3299};
3300
3301static void
3302encoding_map_dealloc(PyObject* o)
3303{
3304 PyObject_FREE(o);
3305}
3306
3307static PyTypeObject EncodingMapType = {
3308 PyObject_HEAD_INIT(NULL)
3309 0, /*ob_size*/
3310 "EncodingMap", /*tp_name*/
3311 sizeof(struct encoding_map), /*tp_basicsize*/
3312 0, /*tp_itemsize*/
3313 /* methods */
3314 encoding_map_dealloc, /*tp_dealloc*/
3315 0, /*tp_print*/
3316 0, /*tp_getattr*/
3317 0, /*tp_setattr*/
3318 0, /*tp_compare*/
3319 0, /*tp_repr*/
3320 0, /*tp_as_number*/
3321 0, /*tp_as_sequence*/
3322 0, /*tp_as_mapping*/
3323 0, /*tp_hash*/
3324 0, /*tp_call*/
3325 0, /*tp_str*/
3326 0, /*tp_getattro*/
3327 0, /*tp_setattro*/
3328 0, /*tp_as_buffer*/
3329 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3330 0, /*tp_doc*/
3331 0, /*tp_traverse*/
3332 0, /*tp_clear*/
3333 0, /*tp_richcompare*/
3334 0, /*tp_weaklistoffset*/
3335 0, /*tp_iter*/
3336 0, /*tp_iternext*/
3337 encoding_map_methods, /*tp_methods*/
3338 0, /*tp_members*/
3339 0, /*tp_getset*/
3340 0, /*tp_base*/
3341 0, /*tp_dict*/
3342 0, /*tp_descr_get*/
3343 0, /*tp_descr_set*/
3344 0, /*tp_dictoffset*/
3345 0, /*tp_init*/
3346 0, /*tp_alloc*/
3347 0, /*tp_new*/
3348 0, /*tp_free*/
3349 0, /*tp_is_gc*/
3350};
3351
3352PyObject*
3353PyUnicode_BuildEncodingMap(PyObject* string)
3354{
3355 Py_UNICODE *decode;
3356 PyObject *result;
3357 struct encoding_map *mresult;
3358 int i;
3359 int need_dict = 0;
3360 unsigned char level1[32];
3361 unsigned char level2[512];
3362 unsigned char *mlevel1, *mlevel2, *mlevel3;
3363 int count2 = 0, count3 = 0;
3364
3365 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3366 PyErr_BadArgument();
3367 return NULL;
3368 }
3369 decode = PyUnicode_AS_UNICODE(string);
3370 memset(level1, 0xFF, sizeof level1);
3371 memset(level2, 0xFF, sizeof level2);
3372
3373 /* If there isn't a one-to-one mapping of NULL to \0,
3374 or if there are non-BMP characters, we need to use
3375 a mapping dictionary. */
3376 if (decode[0] != 0)
3377 need_dict = 1;
3378 for (i = 1; i < 256; i++) {
3379 int l1, l2;
3380 if (decode[i] == 0
3381 #ifdef Py_UNICODE_WIDE
3382 || decode[i] > 0xFFFF
3383 #endif
3384 ) {
3385 need_dict = 1;
3386 break;
3387 }
3388 if (decode[i] == 0xFFFE)
3389 /* unmapped character */
3390 continue;
3391 l1 = decode[i] >> 11;
3392 l2 = decode[i] >> 7;
3393 if (level1[l1] == 0xFF)
3394 level1[l1] = count2++;
3395 if (level2[l2] == 0xFF)
3396 level2[l2] = count3++;
3397 }
3398
3399 if (count2 >= 0xFF || count3 >= 0xFF)
3400 need_dict = 1;
3401
3402 if (need_dict) {
3403 PyObject *result = PyDict_New();
3404 PyObject *key, *value;
3405 if (!result)
3406 return NULL;
3407 for (i = 0; i < 256; i++) {
3408 key = value = NULL;
3409 key = PyInt_FromLong(decode[i]);
3410 value = PyInt_FromLong(i);
3411 if (!key || !value)
3412 goto failed1;
3413 if (PyDict_SetItem(result, key, value) == -1)
3414 goto failed1;
3415 Py_DECREF(key);
3416 Py_DECREF(value);
3417 }
3418 return result;
3419 failed1:
3420 Py_XDECREF(key);
3421 Py_XDECREF(value);
3422 Py_DECREF(result);
3423 return NULL;
3424 }
3425
3426 /* Create a three-level trie */
3427 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3428 16*count2 + 128*count3 - 1);
3429 if (!result)
3430 return PyErr_NoMemory();
3431 PyObject_Init(result, &EncodingMapType);
3432 mresult = (struct encoding_map*)result;
3433 mresult->count2 = count2;
3434 mresult->count3 = count3;
3435 mlevel1 = mresult->level1;
3436 mlevel2 = mresult->level23;
3437 mlevel3 = mresult->level23 + 16*count2;
3438 memcpy(mlevel1, level1, 32);
3439 memset(mlevel2, 0xFF, 16*count2);
3440 memset(mlevel3, 0, 128*count3);
3441 count3 = 0;
3442 for (i = 1; i < 256; i++) {
3443 int o1, o2, o3, i2, i3;
3444 if (decode[i] == 0xFFFE)
3445 /* unmapped character */
3446 continue;
3447 o1 = decode[i]>>11;
3448 o2 = (decode[i]>>7) & 0xF;
3449 i2 = 16*mlevel1[o1] + o2;
3450 if (mlevel2[i2] == 0xFF)
3451 mlevel2[i2] = count3++;
3452 o3 = decode[i] & 0x7F;
3453 i3 = 128*mlevel2[i2] + o3;
3454 mlevel3[i3] = i;
3455 }
3456 return result;
3457}
3458
3459static int
3460encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3461{
3462 struct encoding_map *map = (struct encoding_map*)mapping;
3463 int l1 = c>>11;
3464 int l2 = (c>>7) & 0xF;
3465 int l3 = c & 0x7F;
3466 int i;
3467
3468#ifdef Py_UNICODE_WIDE
3469 if (c > 0xFFFF) {
3470 return -1;
3471 }
3472#endif
3473 if (c == 0)
3474 return 0;
3475 /* level 1*/
3476 i = map->level1[l1];
3477 if (i == 0xFF) {
3478 return -1;
3479 }
3480 /* level 2*/
3481 i = map->level23[16*i+l2];
3482 if (i == 0xFF) {
3483 return -1;
3484 }
3485 /* level 3 */
3486 i = map->level23[16*map->count2 + 128*i + l3];
3487 if (i == 0) {
3488 return -1;
3489 }
3490 return i;
3491}
3492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493/* Lookup the character ch in the mapping. If the character
3494 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003495 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 PyObject *w = PyInt_FromLong((long)c);
3499 PyObject *x;
3500
3501 if (w == NULL)
3502 return NULL;
3503 x = PyObject_GetItem(mapping, w);
3504 Py_DECREF(w);
3505 if (x == NULL) {
3506 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3507 /* No mapping found means: mapping is undefined. */
3508 PyErr_Clear();
3509 x = Py_None;
3510 Py_INCREF(x);
3511 return x;
3512 } else
3513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003515 else if (x == Py_None)
3516 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 else if (PyInt_Check(x)) {
3518 long value = PyInt_AS_LONG(x);
3519 if (value < 0 || value > 255) {
3520 PyErr_SetString(PyExc_TypeError,
3521 "character mapping must be in range(256)");
3522 Py_DECREF(x);
3523 return NULL;
3524 }
3525 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 else if (PyString_Check(x))
3528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003531 PyErr_Format(PyExc_TypeError,
3532 "character mapping must return integer, None or str8, not %.400s",
3533 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 Py_DECREF(x);
3535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 }
3537}
3538
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003539static int
3540charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3541{
3542 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3543 /* exponentially overallocate to minimize reallocations */
3544 if (requiredsize < 2*outsize)
3545 requiredsize = 2*outsize;
3546 if (_PyString_Resize(outobj, requiredsize)) {
3547 return 0;
3548 }
3549 return 1;
3550}
3551
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555/* lookup the character, put the result in the output string and adjust
3556 various state variables. Reallocate the output string if not enough
3557 space is available. Return a new reference to the object that
3558 was put in the output buffer, or Py_None, if the mapping was undefined
3559 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003560 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003562charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003565 PyObject *rep;
3566 char *outstart;
3567 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003569 if (mapping->ob_type == &EncodingMapType) {
3570 int res = encoding_map_lookup(c, mapping);
3571 Py_ssize_t requiredsize = *outpos+1;
3572 if (res == -1)
3573 return enc_FAILED;
3574 if (outsize<requiredsize)
3575 if (!charmapencode_resize(outobj, outpos, requiredsize))
3576 return enc_EXCEPTION;
3577 outstart = PyString_AS_STRING(*outobj);
3578 outstart[(*outpos)++] = (char)res;
3579 return enc_SUCCESS;
3580 }
3581
3582 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 return enc_EXCEPTION;
3585 else if (rep==Py_None) {
3586 Py_DECREF(rep);
3587 return enc_FAILED;
3588 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003591 if (outsize<requiredsize)
3592 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003594 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003596 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3598 }
3599 else {
3600 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3602 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003603 if (outsize<requiredsize)
3604 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003606 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003608 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 memcpy(outstart + *outpos, repchars, repsize);
3610 *outpos += repsize;
3611 }
3612 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003613 Py_DECREF(rep);
3614 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615}
3616
3617/* handle an error in PyUnicode_EncodeCharmap
3618 Return 0 on success, -1 on error */
3619static
3620int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003621 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003623 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003624 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625{
3626 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003627 Py_ssize_t repsize;
3628 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 Py_UNICODE *uni2;
3630 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003631 Py_ssize_t collstartpos = *inpos;
3632 Py_ssize_t collendpos = *inpos+1;
3633 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 char *encoding = "charmap";
3635 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003636 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 /* find all unencodable characters */
3639 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003640 PyObject *rep;
3641 if (mapping->ob_type == &EncodingMapType) {
3642 int res = encoding_map_lookup(p[collendpos], mapping);
3643 if (res != -1)
3644 break;
3645 ++collendpos;
3646 continue;
3647 }
3648
3649 rep = charmapencode_lookup(p[collendpos], mapping);
3650 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003652 else if (rep!=Py_None) {
3653 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 break;
3655 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003656 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 ++collendpos;
3658 }
3659 /* cache callback name lookup
3660 * (if not done yet, i.e. it's the first error) */
3661 if (*known_errorHandler==-1) {
3662 if ((errors==NULL) || (!strcmp(errors, "strict")))
3663 *known_errorHandler = 1;
3664 else if (!strcmp(errors, "replace"))
3665 *known_errorHandler = 2;
3666 else if (!strcmp(errors, "ignore"))
3667 *known_errorHandler = 3;
3668 else if (!strcmp(errors, "xmlcharrefreplace"))
3669 *known_errorHandler = 4;
3670 else
3671 *known_errorHandler = 0;
3672 }
3673 switch (*known_errorHandler) {
3674 case 1: /* strict */
3675 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3676 return -1;
3677 case 2: /* replace */
3678 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3679 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003680 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 return -1;
3682 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003683 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3685 return -1;
3686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 }
3688 /* fall through */
3689 case 3: /* ignore */
3690 *inpos = collendpos;
3691 break;
3692 case 4: /* xmlcharrefreplace */
3693 /* generate replacement (temporarily (mis)uses p) */
3694 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3695 char buffer[2+29+1+1];
3696 char *cp;
3697 sprintf(buffer, "&#%d;", (int)p[collpos]);
3698 for (cp = buffer; *cp; ++cp) {
3699 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003700 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003702 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3704 return -1;
3705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 }
3707 }
3708 *inpos = collendpos;
3709 break;
3710 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003711 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 encoding, reason, p, size, exceptionObject,
3713 collstartpos, collendpos, &newpos);
3714 if (repunicode == NULL)
3715 return -1;
3716 /* generate replacement */
3717 repsize = PyUnicode_GET_SIZE(repunicode);
3718 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3719 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003720 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 return -1;
3722 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003723 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3726 return -1;
3727 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 }
3729 *inpos = newpos;
3730 Py_DECREF(repunicode);
3731 }
3732 return 0;
3733}
3734
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003736 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 PyObject *mapping,
3738 const char *errors)
3739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 /* output object */
3741 PyObject *res = NULL;
3742 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003743 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 PyObject *errorHandler = NULL;
3747 PyObject *exc = NULL;
3748 /* the following variable is used for caching string comparisons
3749 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3750 * 3=ignore, 4=xmlcharrefreplace */
3751 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752
3753 /* Default to Latin-1 */
3754 if (mapping == NULL)
3755 return PyUnicode_EncodeLatin1(p, size, errors);
3756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 /* allocate enough for a simple encoding without
3758 replacements, if we need more, we'll resize */
3759 res = PyString_FromStringAndSize(NULL, size);
3760 if (res == NULL)
3761 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003762 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 while (inpos<size) {
3766 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003767 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3768 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003770 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 if (charmap_encoding_error(p, size, &inpos, mapping,
3772 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003773 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003774 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003775 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 else
3779 /* done with this character => adjust input position */
3780 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 /* Resize if we allocated to much */
3784 if (respos<PyString_GET_SIZE(res)) {
3785 if (_PyString_Resize(&res, respos))
3786 goto onError;
3787 }
3788 Py_XDECREF(exc);
3789 Py_XDECREF(errorHandler);
3790 return res;
3791
3792 onError:
3793 Py_XDECREF(res);
3794 Py_XDECREF(exc);
3795 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 return NULL;
3797}
3798
3799PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3800 PyObject *mapping)
3801{
3802 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3803 PyErr_BadArgument();
3804 return NULL;
3805 }
3806 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3807 PyUnicode_GET_SIZE(unicode),
3808 mapping,
3809 NULL);
3810}
3811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812/* create or adjust a UnicodeTranslateError */
3813static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003814 const Py_UNICODE *unicode, Py_ssize_t size,
3815 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 if (*exceptionObject == NULL) {
3819 *exceptionObject = PyUnicodeTranslateError_Create(
3820 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 }
3822 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3824 goto onError;
3825 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3826 goto onError;
3827 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3828 goto onError;
3829 return;
3830 onError:
3831 Py_DECREF(*exceptionObject);
3832 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 }
3834}
3835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836/* raises a UnicodeTranslateError */
3837static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003838 const Py_UNICODE *unicode, Py_ssize_t size,
3839 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 const char *reason)
3841{
3842 make_translate_exception(exceptionObject,
3843 unicode, size, startpos, endpos, reason);
3844 if (*exceptionObject != NULL)
3845 PyCodec_StrictErrors(*exceptionObject);
3846}
3847
3848/* error handling callback helper:
3849 build arguments, call the callback and check the arguments,
3850 put the result into newpos and return the replacement string, which
3851 has to be freed by the caller */
3852static PyObject *unicode_translate_call_errorhandler(const char *errors,
3853 PyObject **errorHandler,
3854 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003855 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3856 Py_ssize_t startpos, Py_ssize_t endpos,
3857 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003859 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003861 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 PyObject *restuple;
3863 PyObject *resunicode;
3864
3865 if (*errorHandler == NULL) {
3866 *errorHandler = PyCodec_LookupError(errors);
3867 if (*errorHandler == NULL)
3868 return NULL;
3869 }
3870
3871 make_translate_exception(exceptionObject,
3872 unicode, size, startpos, endpos, reason);
3873 if (*exceptionObject == NULL)
3874 return NULL;
3875
3876 restuple = PyObject_CallFunctionObjArgs(
3877 *errorHandler, *exceptionObject, NULL);
3878 if (restuple == NULL)
3879 return NULL;
3880 if (!PyTuple_Check(restuple)) {
3881 PyErr_Format(PyExc_TypeError, &argparse[4]);
3882 Py_DECREF(restuple);
3883 return NULL;
3884 }
3885 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003886 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 Py_DECREF(restuple);
3888 return NULL;
3889 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890 if (i_newpos<0)
3891 *newpos = size+i_newpos;
3892 else
3893 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003894 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003895 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003896 Py_DECREF(restuple);
3897 return NULL;
3898 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 Py_INCREF(resunicode);
3900 Py_DECREF(restuple);
3901 return resunicode;
3902}
3903
3904/* Lookup the character ch in the mapping and put the result in result,
3905 which must be decrefed by the caller.
3906 Return 0 on success, -1 on error */
3907static
3908int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3909{
3910 PyObject *w = PyInt_FromLong((long)c);
3911 PyObject *x;
3912
3913 if (w == NULL)
3914 return -1;
3915 x = PyObject_GetItem(mapping, w);
3916 Py_DECREF(w);
3917 if (x == NULL) {
3918 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3919 /* No mapping found means: use 1:1 mapping. */
3920 PyErr_Clear();
3921 *result = NULL;
3922 return 0;
3923 } else
3924 return -1;
3925 }
3926 else if (x == Py_None) {
3927 *result = x;
3928 return 0;
3929 }
3930 else if (PyInt_Check(x)) {
3931 long value = PyInt_AS_LONG(x);
3932 long max = PyUnicode_GetMax();
3933 if (value < 0 || value > max) {
3934 PyErr_Format(PyExc_TypeError,
3935 "character mapping must be in range(0x%lx)", max+1);
3936 Py_DECREF(x);
3937 return -1;
3938 }
3939 *result = x;
3940 return 0;
3941 }
3942 else if (PyUnicode_Check(x)) {
3943 *result = x;
3944 return 0;
3945 }
3946 else {
3947 /* wrong return value */
3948 PyErr_SetString(PyExc_TypeError,
3949 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003950 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 return -1;
3952 }
3953}
3954/* ensure that *outobj is at least requiredsize characters long,
3955if not reallocate and adjust various state variables.
3956Return 0 on success, -1 on error */
3957static
Walter Dörwald4894c302003-10-24 14:25:28 +00003958int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003961 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003962 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003964 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003966 if (requiredsize < 2 * oldsize)
3967 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003968 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 return -1;
3970 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 }
3972 return 0;
3973}
3974/* lookup the character, put the result in the output string and adjust
3975 various state variables. Return a new reference to the object that
3976 was put in the output buffer in *result, or Py_None, if the mapping was
3977 undefined (in which case no character was written).
3978 The called must decref result.
3979 Return 0 on success, -1 on error. */
3980static
Walter Dörwald4894c302003-10-24 14:25:28 +00003981int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003982 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003983 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984{
Walter Dörwald4894c302003-10-24 14:25:28 +00003985 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 return -1;
3987 if (*res==NULL) {
3988 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003989 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 }
3991 else if (*res==Py_None)
3992 ;
3993 else if (PyInt_Check(*res)) {
3994 /* no overflow check, because we know that the space is enough */
3995 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3996 }
3997 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003998 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 if (repsize==1) {
4000 /* no overflow check, because we know that the space is enough */
4001 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4002 }
4003 else if (repsize!=0) {
4004 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004005 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004006 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004007 repsize - 1;
4008 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 return -1;
4010 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4011 *outp += repsize;
4012 }
4013 }
4014 else
4015 return -1;
4016 return 0;
4017}
4018
4019PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004020 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 PyObject *mapping,
4022 const char *errors)
4023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 /* output object */
4025 PyObject *res = NULL;
4026 /* pointers to the beginning and end+1 of input */
4027 const Py_UNICODE *startp = p;
4028 const Py_UNICODE *endp = p + size;
4029 /* pointer into the output */
4030 Py_UNICODE *str;
4031 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004032 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 char *reason = "character maps to <undefined>";
4034 PyObject *errorHandler = NULL;
4035 PyObject *exc = NULL;
4036 /* the following variable is used for caching string comparisons
4037 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4038 * 3=ignore, 4=xmlcharrefreplace */
4039 int known_errorHandler = -1;
4040
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 if (mapping == NULL) {
4042 PyErr_BadArgument();
4043 return NULL;
4044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045
4046 /* allocate enough for a simple 1:1 translation without
4047 replacements, if we need more, we'll resize */
4048 res = PyUnicode_FromUnicode(NULL, size);
4049 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 return res;
4053 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 while (p<endp) {
4056 /* try to encode it */
4057 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004058 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 goto onError;
4061 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004062 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 if (x!=Py_None) /* it worked => adjust input pointer */
4064 ++p;
4065 else { /* untranslatable character */
4066 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t repsize;
4068 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 Py_UNICODE *uni2;
4070 /* startpos for collecting untranslatable chars */
4071 const Py_UNICODE *collstart = p;
4072 const Py_UNICODE *collend = p+1;
4073 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 /* find all untranslatable characters */
4076 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004077 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 goto onError;
4079 Py_XDECREF(x);
4080 if (x!=Py_None)
4081 break;
4082 ++collend;
4083 }
4084 /* cache callback name lookup
4085 * (if not done yet, i.e. it's the first error) */
4086 if (known_errorHandler==-1) {
4087 if ((errors==NULL) || (!strcmp(errors, "strict")))
4088 known_errorHandler = 1;
4089 else if (!strcmp(errors, "replace"))
4090 known_errorHandler = 2;
4091 else if (!strcmp(errors, "ignore"))
4092 known_errorHandler = 3;
4093 else if (!strcmp(errors, "xmlcharrefreplace"))
4094 known_errorHandler = 4;
4095 else
4096 known_errorHandler = 0;
4097 }
4098 switch (known_errorHandler) {
4099 case 1: /* strict */
4100 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4101 goto onError;
4102 case 2: /* replace */
4103 /* No need to check for space, this is a 1:1 replacement */
4104 for (coll = collstart; coll<collend; ++coll)
4105 *str++ = '?';
4106 /* fall through */
4107 case 3: /* ignore */
4108 p = collend;
4109 break;
4110 case 4: /* xmlcharrefreplace */
4111 /* generate replacement (temporarily (mis)uses p) */
4112 for (p = collstart; p < collend; ++p) {
4113 char buffer[2+29+1+1];
4114 char *cp;
4115 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004116 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4118 goto onError;
4119 for (cp = buffer; *cp; ++cp)
4120 *str++ = *cp;
4121 }
4122 p = collend;
4123 break;
4124 default:
4125 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4126 reason, startp, size, &exc,
4127 collstart-startp, collend-startp, &newpos);
4128 if (repunicode == NULL)
4129 goto onError;
4130 /* generate replacement */
4131 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004132 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4134 Py_DECREF(repunicode);
4135 goto onError;
4136 }
4137 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4138 *str++ = *uni2;
4139 p = startp + newpos;
4140 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 }
4142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 /* Resize if we allocated to much */
4145 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004146 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004147 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004148 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 }
4150 Py_XDECREF(exc);
4151 Py_XDECREF(errorHandler);
4152 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 onError:
4155 Py_XDECREF(res);
4156 Py_XDECREF(exc);
4157 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 return NULL;
4159}
4160
4161PyObject *PyUnicode_Translate(PyObject *str,
4162 PyObject *mapping,
4163 const char *errors)
4164{
4165 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004166
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 str = PyUnicode_FromObject(str);
4168 if (str == NULL)
4169 goto onError;
4170 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4171 PyUnicode_GET_SIZE(str),
4172 mapping,
4173 errors);
4174 Py_DECREF(str);
4175 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004176
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 onError:
4178 Py_XDECREF(str);
4179 return NULL;
4180}
Tim Petersced69f82003-09-16 20:30:58 +00004181
Guido van Rossum9e896b32000-04-05 20:11:21 +00004182/* --- Decimal Encoder ---------------------------------------------------- */
4183
4184int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004185 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004186 char *output,
4187 const char *errors)
4188{
4189 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 PyObject *errorHandler = NULL;
4191 PyObject *exc = NULL;
4192 const char *encoding = "decimal";
4193 const char *reason = "invalid decimal Unicode string";
4194 /* the following variable is used for caching string comparisons
4195 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4196 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004197
4198 if (output == NULL) {
4199 PyErr_BadArgument();
4200 return -1;
4201 }
4202
4203 p = s;
4204 end = s + length;
4205 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004207 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 Py_ssize_t repsize;
4210 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 Py_UNICODE *uni2;
4212 Py_UNICODE *collstart;
4213 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004214
Guido van Rossum9e896b32000-04-05 20:11:21 +00004215 if (Py_UNICODE_ISSPACE(ch)) {
4216 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004218 continue;
4219 }
4220 decimal = Py_UNICODE_TODECIMAL(ch);
4221 if (decimal >= 0) {
4222 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004224 continue;
4225 }
Guido van Rossumba477042000-04-06 18:18:10 +00004226 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004227 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004229 continue;
4230 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 /* All other characters are considered unencodable */
4232 collstart = p;
4233 collend = p+1;
4234 while (collend < end) {
4235 if ((0 < *collend && *collend < 256) ||
4236 !Py_UNICODE_ISSPACE(*collend) ||
4237 Py_UNICODE_TODECIMAL(*collend))
4238 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004239 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 /* cache callback name lookup
4241 * (if not done yet, i.e. it's the first error) */
4242 if (known_errorHandler==-1) {
4243 if ((errors==NULL) || (!strcmp(errors, "strict")))
4244 known_errorHandler = 1;
4245 else if (!strcmp(errors, "replace"))
4246 known_errorHandler = 2;
4247 else if (!strcmp(errors, "ignore"))
4248 known_errorHandler = 3;
4249 else if (!strcmp(errors, "xmlcharrefreplace"))
4250 known_errorHandler = 4;
4251 else
4252 known_errorHandler = 0;
4253 }
4254 switch (known_errorHandler) {
4255 case 1: /* strict */
4256 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4257 goto onError;
4258 case 2: /* replace */
4259 for (p = collstart; p < collend; ++p)
4260 *output++ = '?';
4261 /* fall through */
4262 case 3: /* ignore */
4263 p = collend;
4264 break;
4265 case 4: /* xmlcharrefreplace */
4266 /* generate replacement (temporarily (mis)uses p) */
4267 for (p = collstart; p < collend; ++p)
4268 output += sprintf(output, "&#%d;", (int)*p);
4269 p = collend;
4270 break;
4271 default:
4272 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4273 encoding, reason, s, length, &exc,
4274 collstart-s, collend-s, &newpos);
4275 if (repunicode == NULL)
4276 goto onError;
4277 /* generate replacement */
4278 repsize = PyUnicode_GET_SIZE(repunicode);
4279 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4280 Py_UNICODE ch = *uni2;
4281 if (Py_UNICODE_ISSPACE(ch))
4282 *output++ = ' ';
4283 else {
4284 decimal = Py_UNICODE_TODECIMAL(ch);
4285 if (decimal >= 0)
4286 *output++ = '0' + decimal;
4287 else if (0 < ch && ch < 256)
4288 *output++ = (char)ch;
4289 else {
4290 Py_DECREF(repunicode);
4291 raise_encode_exception(&exc, encoding,
4292 s, length, collstart-s, collend-s, reason);
4293 goto onError;
4294 }
4295 }
4296 }
4297 p = s + newpos;
4298 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004299 }
4300 }
4301 /* 0-terminate the output string */
4302 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 Py_XDECREF(exc);
4304 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004305 return 0;
4306
4307 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 Py_XDECREF(exc);
4309 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004310 return -1;
4311}
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313/* --- Helpers ------------------------------------------------------------ */
4314
Thomas Wouters477c8d52006-05-27 19:21:47 +00004315#define STRINGLIB_CHAR Py_UNICODE
4316
4317#define STRINGLIB_LEN PyUnicode_GET_SIZE
4318#define STRINGLIB_NEW PyUnicode_FromUnicode
4319#define STRINGLIB_STR PyUnicode_AS_UNICODE
4320
4321Py_LOCAL_INLINE(int)
4322STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004324 if (str[0] != other[0])
4325 return 1;
4326 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327}
4328
Thomas Wouters477c8d52006-05-27 19:21:47 +00004329#define STRINGLIB_EMPTY unicode_empty
4330
4331#include "stringlib/fastsearch.h"
4332
4333#include "stringlib/count.h"
4334#include "stringlib/find.h"
4335#include "stringlib/partition.h"
4336
/* Helper macro to fix up start/end slice values against (obj)->length.
   It operates on variables named start and end in the enclosing scope:
   a negative value is taken relative to the end of the object and then
   clamped to 0, and end is capped at the object's length.
   Wrapped in do { } while (0) so that "FIX_START_END(x);" behaves as a
   single statement, e.g. as the body of an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4349
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004351 PyObject *substr,
4352 Py_ssize_t start,
4353 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004356 PyUnicodeObject* str_obj;
4357 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004358
Thomas Wouters477c8d52006-05-27 19:21:47 +00004359 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4360 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004362 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4363 if (!sub_obj) {
4364 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 return -1;
4366 }
Tim Petersced69f82003-09-16 20:30:58 +00004367
Thomas Wouters477c8d52006-05-27 19:21:47 +00004368 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004369
Thomas Wouters477c8d52006-05-27 19:21:47 +00004370 result = stringlib_count(
4371 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4372 );
4373
4374 Py_DECREF(sub_obj);
4375 Py_DECREF(str_obj);
4376
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 return result;
4378}
4379
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004381 PyObject *sub,
4382 Py_ssize_t start,
4383 Py_ssize_t end,
4384 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004387
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004389 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004390 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004391 sub = PyUnicode_FromObject(sub);
4392 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004393 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004394 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 }
Tim Petersced69f82003-09-16 20:30:58 +00004396
Thomas Wouters477c8d52006-05-27 19:21:47 +00004397 if (direction > 0)
4398 result = stringlib_find_slice(
4399 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4400 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4401 start, end
4402 );
4403 else
4404 result = stringlib_rfind_slice(
4405 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4406 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4407 start, end
4408 );
4409
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004411 Py_DECREF(sub);
4412
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 return result;
4414}
4415
Tim Petersced69f82003-09-16 20:30:58 +00004416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417int tailmatch(PyUnicodeObject *self,
4418 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004419 Py_ssize_t start,
4420 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 int direction)
4422{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 if (substring->length == 0)
4424 return 1;
4425
Thomas Wouters477c8d52006-05-27 19:21:47 +00004426 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427
4428 end -= substring->length;
4429 if (end < start)
4430 return 0;
4431
4432 if (direction > 0) {
4433 if (Py_UNICODE_MATCH(self, end, substring))
4434 return 1;
4435 } else {
4436 if (Py_UNICODE_MATCH(self, start, substring))
4437 return 1;
4438 }
4439
4440 return 0;
4441}
4442
Martin v. Löwis18e16552006-02-15 17:27:45 +00004443Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445 Py_ssize_t start,
4446 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 int direction)
4448{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 str = PyUnicode_FromObject(str);
4452 if (str == NULL)
4453 return -1;
4454 substr = PyUnicode_FromObject(substr);
4455 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004456 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 return -1;
4458 }
Tim Petersced69f82003-09-16 20:30:58 +00004459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 result = tailmatch((PyUnicodeObject *)str,
4461 (PyUnicodeObject *)substr,
4462 start, end, direction);
4463 Py_DECREF(str);
4464 Py_DECREF(substr);
4465 return result;
4466}
4467
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468/* Apply fixfct filter to the Unicode object self and return a
4469 reference to the modified object */
4470
Tim Petersced69f82003-09-16 20:30:58 +00004471static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472PyObject *fixup(PyUnicodeObject *self,
4473 int (*fixfct)(PyUnicodeObject *s))
4474{
4475
4476 PyUnicodeObject *u;
4477
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004478 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 if (u == NULL)
4480 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004481
4482 Py_UNICODE_COPY(u->str, self->str, self->length);
4483
Tim Peters7a29bd52001-09-12 03:03:31 +00004484 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485 /* fixfct should return TRUE if it modified the buffer. If
4486 FALSE, return a reference to the original buffer instead
4487 (to save space, not time) */
4488 Py_INCREF(self);
4489 Py_DECREF(u);
4490 return (PyObject*) self;
4491 }
4492 return (PyObject*) u;
4493}
4494
Tim Petersced69f82003-09-16 20:30:58 +00004495static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496int fixupper(PyUnicodeObject *self)
4497{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004498 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 Py_UNICODE *s = self->str;
4500 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004501
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 while (len-- > 0) {
4503 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 ch = Py_UNICODE_TOUPPER(*s);
4506 if (ch != *s) {
4507 status = 1;
4508 *s = ch;
4509 }
4510 s++;
4511 }
4512
4513 return status;
4514}
4515
Tim Petersced69f82003-09-16 20:30:58 +00004516static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517int fixlower(PyUnicodeObject *self)
4518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004519 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 Py_UNICODE *s = self->str;
4521 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004522
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 while (len-- > 0) {
4524 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004525
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 ch = Py_UNICODE_TOLOWER(*s);
4527 if (ch != *s) {
4528 status = 1;
4529 *s = ch;
4530 }
4531 s++;
4532 }
4533
4534 return status;
4535}
4536
Tim Petersced69f82003-09-16 20:30:58 +00004537static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538int fixswapcase(PyUnicodeObject *self)
4539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 Py_UNICODE *s = self->str;
4542 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004543
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 while (len-- > 0) {
4545 if (Py_UNICODE_ISUPPER(*s)) {
4546 *s = Py_UNICODE_TOLOWER(*s);
4547 status = 1;
4548 } else if (Py_UNICODE_ISLOWER(*s)) {
4549 *s = Py_UNICODE_TOUPPER(*s);
4550 status = 1;
4551 }
4552 s++;
4553 }
4554
4555 return status;
4556}
4557
Tim Petersced69f82003-09-16 20:30:58 +00004558static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559int fixcapitalize(PyUnicodeObject *self)
4560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004562 Py_UNICODE *s = self->str;
4563 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004564
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004565 if (len == 0)
4566 return 0;
4567 if (Py_UNICODE_ISLOWER(*s)) {
4568 *s = Py_UNICODE_TOUPPER(*s);
4569 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004571 s++;
4572 while (--len > 0) {
4573 if (Py_UNICODE_ISUPPER(*s)) {
4574 *s = Py_UNICODE_TOLOWER(*s);
4575 status = 1;
4576 }
4577 s++;
4578 }
4579 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580}
4581
4582static
4583int fixtitle(PyUnicodeObject *self)
4584{
4585 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4586 register Py_UNICODE *e;
4587 int previous_is_cased;
4588
4589 /* Shortcut for single character strings */
4590 if (PyUnicode_GET_SIZE(self) == 1) {
4591 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4592 if (*p != ch) {
4593 *p = ch;
4594 return 1;
4595 }
4596 else
4597 return 0;
4598 }
Tim Petersced69f82003-09-16 20:30:58 +00004599
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 e = p + PyUnicode_GET_SIZE(self);
4601 previous_is_cased = 0;
4602 for (; p < e; p++) {
4603 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 if (previous_is_cased)
4606 *p = Py_UNICODE_TOLOWER(ch);
4607 else
4608 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004609
4610 if (Py_UNICODE_ISLOWER(ch) ||
4611 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 Py_UNICODE_ISTITLE(ch))
4613 previous_is_cased = 1;
4614 else
4615 previous_is_cased = 0;
4616 }
4617 return 1;
4618}
4619
Tim Peters8ce9f162004-08-27 01:49:32 +00004620PyObject *
4621PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622{
Tim Peters8ce9f162004-08-27 01:49:32 +00004623 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004624 const Py_UNICODE blank = ' ';
4625 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004626 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004627 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004628 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4629 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004630 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4631 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004633 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004634 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635
Tim Peters05eba1f2004-08-27 21:32:02 +00004636 fseq = PySequence_Fast(seq, "");
4637 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004638 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004639 }
4640
Tim Peters91879ab2004-08-27 22:35:44 +00004641 /* Grrrr. A codec may be invoked to convert str objects to
4642 * Unicode, and so it's possible to call back into Python code
4643 * during PyUnicode_FromObject(), and so it's possible for a sick
4644 * codec to change the size of fseq (if seq is a list). Therefore
4645 * we have to keep refetching the size -- can't assume seqlen
4646 * is invariant.
4647 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004648 seqlen = PySequence_Fast_GET_SIZE(fseq);
4649 /* If empty sequence, return u"". */
4650 if (seqlen == 0) {
4651 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4652 goto Done;
4653 }
4654 /* If singleton sequence with an exact Unicode, return that. */
4655 if (seqlen == 1) {
4656 item = PySequence_Fast_GET_ITEM(fseq, 0);
4657 if (PyUnicode_CheckExact(item)) {
4658 Py_INCREF(item);
4659 res = (PyUnicodeObject *)item;
4660 goto Done;
4661 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004662 }
4663
Tim Peters05eba1f2004-08-27 21:32:02 +00004664 /* At least two items to join, or one that isn't exact Unicode. */
4665 if (seqlen > 1) {
4666 /* Set up sep and seplen -- they're needed. */
4667 if (separator == NULL) {
4668 sep = &blank;
4669 seplen = 1;
4670 }
4671 else {
4672 internal_separator = PyUnicode_FromObject(separator);
4673 if (internal_separator == NULL)
4674 goto onError;
4675 sep = PyUnicode_AS_UNICODE(internal_separator);
4676 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004677 /* In case PyUnicode_FromObject() mutated seq. */
4678 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 }
4680 }
4681
4682 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004683 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004685 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004686 res_p = PyUnicode_AS_UNICODE(res);
4687 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004688
Tim Peters05eba1f2004-08-27 21:32:02 +00004689 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004690 Py_ssize_t itemlen;
4691 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004692
4693 item = PySequence_Fast_GET_ITEM(fseq, i);
4694 /* Convert item to Unicode. */
4695 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4696 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004697 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004698 " %.80s found",
4699 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004700 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004701 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004702 item = PyUnicode_FromObject(item);
4703 if (item == NULL)
4704 goto onError;
4705 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004706
Tim Peters91879ab2004-08-27 22:35:44 +00004707 /* In case PyUnicode_FromObject() mutated seq. */
4708 seqlen = PySequence_Fast_GET_SIZE(fseq);
4709
Tim Peters8ce9f162004-08-27 01:49:32 +00004710 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004712 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004713 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004714 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004715 if (i < seqlen - 1) {
4716 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004717 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004718 goto Overflow;
4719 }
4720 if (new_res_used > res_alloc) {
4721 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004722 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004723 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004724 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004725 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004726 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004727 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004728 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004730 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004731 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004733
4734 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004735 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004736 res_p += itemlen;
4737 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004738 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004739 res_p += seplen;
4740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004742 res_used = new_res_used;
4743 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004744
Tim Peters05eba1f2004-08-27 21:32:02 +00004745 /* Shrink res to match the used area; this probably can't fail,
4746 * but it's cheap to check.
4747 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004748 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004749 goto onError;
4750
4751 Done:
4752 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004753 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 return (PyObject *)res;
4755
Tim Peters8ce9f162004-08-27 01:49:32 +00004756 Overflow:
4757 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004758 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004759 Py_DECREF(item);
4760 /* fall through */
4761
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004763 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004764 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004765 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 return NULL;
4767}
4768
Tim Petersced69f82003-09-16 20:30:58 +00004769static
4770PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004771 Py_ssize_t left,
4772 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 Py_UNICODE fill)
4774{
4775 PyUnicodeObject *u;
4776
4777 if (left < 0)
4778 left = 0;
4779 if (right < 0)
4780 right = 0;
4781
Tim Peters7a29bd52001-09-12 03:03:31 +00004782 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 Py_INCREF(self);
4784 return self;
4785 }
4786
4787 u = _PyUnicode_New(left + self->length + right);
4788 if (u) {
4789 if (left)
4790 Py_UNICODE_FILL(u->str, fill, left);
4791 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4792 if (right)
4793 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4794 }
4795
4796 return u;
4797}
4798
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `list` variable and an `onError` label; control jumps to `onError` if
   either creating the slice or appending it fails.  The temporary
   reference held in `str` is always released.

   Wrapped in do { } while (0) so that `SPLIT_APPEND(...);` behaves as a
   single statement, including under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4809
4810static
4811PyObject *split_whitespace(PyUnicodeObject *self,
4812 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004813 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 register Py_ssize_t i;
4816 register Py_ssize_t j;
4817 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 PyObject *str;
4819
4820 for (i = j = 0; i < len; ) {
4821 /* find a token */
4822 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4823 i++;
4824 j = i;
4825 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4826 i++;
4827 if (j < i) {
4828 if (maxcount-- <= 0)
4829 break;
4830 SPLIT_APPEND(self->str, j, i);
4831 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4832 i++;
4833 j = i;
4834 }
4835 }
4836 if (j < len) {
4837 SPLIT_APPEND(self->str, j, len);
4838 }
4839 return list;
4840
4841 onError:
4842 Py_DECREF(list);
4843 return NULL;
4844}
4845
4846PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004847 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004849 register Py_ssize_t i;
4850 register Py_ssize_t j;
4851 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 PyObject *list;
4853 PyObject *str;
4854 Py_UNICODE *data;
4855
4856 string = PyUnicode_FromObject(string);
4857 if (string == NULL)
4858 return NULL;
4859 data = PyUnicode_AS_UNICODE(string);
4860 len = PyUnicode_GET_SIZE(string);
4861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 list = PyList_New(0);
4863 if (!list)
4864 goto onError;
4865
4866 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004867 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004868
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004870 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
4873 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004874 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 if (i < len) {
4876 if (data[i] == '\r' && i + 1 < len &&
4877 data[i+1] == '\n')
4878 i += 2;
4879 else
4880 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004881 if (keepends)
4882 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 }
Guido van Rossum86662912000-04-11 15:38:46 +00004884 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 j = i;
4886 }
4887 if (j < len) {
4888 SPLIT_APPEND(data, j, len);
4889 }
4890
4891 Py_DECREF(string);
4892 return list;
4893
4894 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004895 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 Py_DECREF(string);
4897 return NULL;
4898}
4899
Tim Petersced69f82003-09-16 20:30:58 +00004900static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901PyObject *split_char(PyUnicodeObject *self,
4902 PyObject *list,
4903 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004904 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004906 register Py_ssize_t i;
4907 register Py_ssize_t j;
4908 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 PyObject *str;
4910
4911 for (i = j = 0; i < len; ) {
4912 if (self->str[i] == ch) {
4913 if (maxcount-- <= 0)
4914 break;
4915 SPLIT_APPEND(self->str, j, i);
4916 i = j = i + 1;
4917 } else
4918 i++;
4919 }
4920 if (j <= len) {
4921 SPLIT_APPEND(self->str, j, len);
4922 }
4923 return list;
4924
4925 onError:
4926 Py_DECREF(list);
4927 return NULL;
4928}
4929
Tim Petersced69f82003-09-16 20:30:58 +00004930static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931PyObject *split_substring(PyUnicodeObject *self,
4932 PyObject *list,
4933 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004936 register Py_ssize_t i;
4937 register Py_ssize_t j;
4938 Py_ssize_t len = self->length;
4939 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 PyObject *str;
4941
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004942 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 if (Py_UNICODE_MATCH(self, i, substring)) {
4944 if (maxcount-- <= 0)
4945 break;
4946 SPLIT_APPEND(self->str, j, i);
4947 i = j = i + sublen;
4948 } else
4949 i++;
4950 }
4951 if (j <= len) {
4952 SPLIT_APPEND(self->str, j, len);
4953 }
4954 return list;
4955
4956 onError:
4957 Py_DECREF(list);
4958 return NULL;
4959}
4960
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004961static
4962PyObject *rsplit_whitespace(PyUnicodeObject *self,
4963 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004964 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004965{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004966 register Py_ssize_t i;
4967 register Py_ssize_t j;
4968 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969 PyObject *str;
4970
4971 for (i = j = len - 1; i >= 0; ) {
4972 /* find a token */
4973 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4974 i--;
4975 j = i;
4976 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4977 i--;
4978 if (j > i) {
4979 if (maxcount-- <= 0)
4980 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004981 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004982 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4983 i--;
4984 j = i;
4985 }
4986 }
4987 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004988 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004990 if (PyList_Reverse(list) < 0)
4991 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004992 return list;
4993
4994 onError:
4995 Py_DECREF(list);
4996 return NULL;
4997}
4998
4999static
5000PyObject *rsplit_char(PyUnicodeObject *self,
5001 PyObject *list,
5002 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005003 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005005 register Py_ssize_t i;
5006 register Py_ssize_t j;
5007 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005008 PyObject *str;
5009
5010 for (i = j = len - 1; i >= 0; ) {
5011 if (self->str[i] == ch) {
5012 if (maxcount-- <= 0)
5013 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005014 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005015 j = i = i - 1;
5016 } else
5017 i--;
5018 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005019 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005020 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005021 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005022 if (PyList_Reverse(list) < 0)
5023 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005024 return list;
5025
5026 onError:
5027 Py_DECREF(list);
5028 return NULL;
5029}
5030
5031static
5032PyObject *rsplit_substring(PyUnicodeObject *self,
5033 PyObject *list,
5034 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005037 register Py_ssize_t i;
5038 register Py_ssize_t j;
5039 Py_ssize_t len = self->length;
5040 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041 PyObject *str;
5042
5043 for (i = len - sublen, j = len; i >= 0; ) {
5044 if (Py_UNICODE_MATCH(self, i, substring)) {
5045 if (maxcount-- <= 0)
5046 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048 j = i;
5049 i -= sublen;
5050 } else
5051 i--;
5052 }
5053 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005054 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005055 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 if (PyList_Reverse(list) < 0)
5057 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005058 return list;
5059
5060 onError:
5061 Py_DECREF(list);
5062 return NULL;
5063}
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065#undef SPLIT_APPEND
5066
5067static
5068PyObject *split(PyUnicodeObject *self,
5069 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005070 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071{
5072 PyObject *list;
5073
5074 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005075 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076
5077 list = PyList_New(0);
5078 if (!list)
5079 return NULL;
5080
5081 if (substring == NULL)
5082 return split_whitespace(self,list,maxcount);
5083
5084 else if (substring->length == 1)
5085 return split_char(self,list,substring->str[0],maxcount);
5086
5087 else if (substring->length == 0) {
5088 Py_DECREF(list);
5089 PyErr_SetString(PyExc_ValueError, "empty separator");
5090 return NULL;
5091 }
5092 else
5093 return split_substring(self,list,substring,maxcount);
5094}
5095
Tim Petersced69f82003-09-16 20:30:58 +00005096static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005097PyObject *rsplit(PyUnicodeObject *self,
5098 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005099 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005100{
5101 PyObject *list;
5102
5103 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005104 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005105
5106 list = PyList_New(0);
5107 if (!list)
5108 return NULL;
5109
5110 if (substring == NULL)
5111 return rsplit_whitespace(self,list,maxcount);
5112
5113 else if (substring->length == 1)
5114 return rsplit_char(self,list,substring->str[0],maxcount);
5115
5116 else if (substring->length == 0) {
5117 Py_DECREF(list);
5118 PyErr_SetString(PyExc_ValueError, "empty separator");
5119 return NULL;
5120 }
5121 else
5122 return rsplit_substring(self,list,substring,maxcount);
5123}
5124
5125static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126PyObject *replace(PyUnicodeObject *self,
5127 PyUnicodeObject *str1,
5128 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005129 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130{
5131 PyUnicodeObject *u;
5132
5133 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005134 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
Thomas Wouters477c8d52006-05-27 19:21:47 +00005136 if (str1->length == str2->length) {
5137 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005138 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005139 if (str1->length == 1) {
5140 /* replace characters */
5141 Py_UNICODE u1, u2;
5142 if (!findchar(self->str, self->length, str1->str[0]))
5143 goto nothing;
5144 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5145 if (!u)
5146 return NULL;
5147 Py_UNICODE_COPY(u->str, self->str, self->length);
5148 u1 = str1->str[0];
5149 u2 = str2->str[0];
5150 for (i = 0; i < u->length; i++)
5151 if (u->str[i] == u1) {
5152 if (--maxcount < 0)
5153 break;
5154 u->str[i] = u2;
5155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005157 i = fastsearch(
5158 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005160 if (i < 0)
5161 goto nothing;
5162 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5163 if (!u)
5164 return NULL;
5165 Py_UNICODE_COPY(u->str, self->str, self->length);
5166 while (i <= self->length - str1->length)
5167 if (Py_UNICODE_MATCH(self, i, str1)) {
5168 if (--maxcount < 0)
5169 break;
5170 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5171 i += str1->length;
5172 } else
5173 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005176
5177 Py_ssize_t n, i, j, e;
5178 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 Py_UNICODE *p;
5180
5181 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005182 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 if (n > maxcount)
5184 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005185 if (n == 0)
5186 goto nothing;
5187 /* new_size = self->length + n * (str2->length - str1->length)); */
5188 delta = (str2->length - str1->length);
5189 if (delta == 0) {
5190 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005192 product = n * (str2->length - str1->length);
5193 if ((product / (str2->length - str1->length)) != n) {
5194 PyErr_SetString(PyExc_OverflowError,
5195 "replace string is too long");
5196 return NULL;
5197 }
5198 new_size = self->length + product;
5199 if (new_size < 0) {
5200 PyErr_SetString(PyExc_OverflowError,
5201 "replace string is too long");
5202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 }
5204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005205 u = _PyUnicode_New(new_size);
5206 if (!u)
5207 return NULL;
5208 i = 0;
5209 p = u->str;
5210 e = self->length - str1->length;
5211 if (str1->length > 0) {
5212 while (n-- > 0) {
5213 /* look for next match */
5214 j = i;
5215 while (j <= e) {
5216 if (Py_UNICODE_MATCH(self, j, str1))
5217 break;
5218 j++;
5219 }
5220 if (j > i) {
5221 if (j > e)
5222 break;
5223 /* copy unchanged part [i:j] */
5224 Py_UNICODE_COPY(p, self->str+i, j-i);
5225 p += j - i;
5226 }
5227 /* copy substitution string */
5228 if (str2->length > 0) {
5229 Py_UNICODE_COPY(p, str2->str, str2->length);
5230 p += str2->length;
5231 }
5232 i = j + str1->length;
5233 }
5234 if (i < self->length)
5235 /* copy tail [i:] */
5236 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5237 } else {
5238 /* interleave */
5239 while (n > 0) {
5240 Py_UNICODE_COPY(p, str2->str, str2->length);
5241 p += str2->length;
5242 if (--n <= 0)
5243 break;
5244 *p++ = self->str[i++];
5245 }
5246 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005250
5251nothing:
5252 /* nothing to replace; return original string (when possible) */
5253 if (PyUnicode_CheckExact(self)) {
5254 Py_INCREF(self);
5255 return (PyObject *) self;
5256 }
5257 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258}
5259
5260/* --- Unicode Object Methods --------------------------------------------- */
5261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005262PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263"S.title() -> unicode\n\
5264\n\
5265Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005266characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
5268static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005269unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 return fixup(self, fixtitle);
5272}
5273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005274PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275"S.capitalize() -> unicode\n\
5276\n\
5277Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005278have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
5280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005281unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 return fixup(self, fixcapitalize);
5284}
5285
#if 0
/* Compiled out.  Kept for reference only. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, run fixup(..., fixcapitalize) on every word,
   and join the words again with PyUnicode_Join(NULL, ...).  Returns NULL
   if the split, any fixup, or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                               fixcapitalize);
        if (word == NULL)
            goto onError;       /* result is still NULL */
        /* PyList_SET_ITEM does not release the old item; do it by hand */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5323
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005324/* Argument converter. Coerces to a single unicode character */
5325
5326static int
5327convert_uc(PyObject *obj, void *addr)
5328{
5329 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5330 PyObject *uniobj;
5331 Py_UNICODE *unistr;
5332
5333 uniobj = PyUnicode_FromObject(obj);
5334 if (uniobj == NULL) {
5335 PyErr_SetString(PyExc_TypeError,
5336 "The fill character cannot be converted to Unicode");
5337 return 0;
5338 }
5339 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5340 PyErr_SetString(PyExc_TypeError,
5341 "The fill character must be exactly one character long");
5342 Py_DECREF(uniobj);
5343 return 0;
5344 }
5345 unistr = PyUnicode_AS_UNICODE(uniobj);
5346 *fillcharloc = unistr[0];
5347 Py_DECREF(uniobj);
5348 return 1;
5349}
5350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005351PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005352"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005354Return S centered in a Unicode string of length width. Padding is\n\
5355done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
5357static PyObject *
5358unicode_center(PyUnicodeObject *self, PyObject *args)
5359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005360 Py_ssize_t marg, left;
5361 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005362 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363
Thomas Woutersde017742006-02-16 19:34:37 +00005364 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 return NULL;
5366
Tim Peters7a29bd52001-09-12 03:03:31 +00005367 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 Py_INCREF(self);
5369 return (PyObject*) self;
5370 }
5371
5372 marg = width - self->length;
5373 left = marg / 2 + (marg & width & 1);
5374
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005375 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376}
5377
Marc-André Lemburge5034372000-08-08 08:04:29 +00005378#if 0
5379
5380/* This code should go into some future Unicode collation support
5381 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005382 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005383
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005384/* speedy UTF-16 code point order comparison */
5385/* gleaned from: */
5386/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5387
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005388static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005389{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005390 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005391 0, 0, 0, 0, 0, 0, 0, 0,
5392 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005393 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005394};
5395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396static int
5397unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 Py_UNICODE *s1 = str1->str;
5402 Py_UNICODE *s2 = str2->str;
5403
5404 len1 = str1->length;
5405 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005408 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005409
5410 c1 = *s1++;
5411 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005412
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005413 if (c1 > (1<<11) * 26)
5414 c1 += utf16Fixup[c1>>11];
5415 if (c2 > (1<<11) * 26)
5416 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005417 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005418
5419 if (c1 != c2)
5420 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005421
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005422 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 }
5424
5425 return (len1 < len2) ? -1 : (len1 != len2);
5426}
5427
Marc-André Lemburge5034372000-08-08 08:04:29 +00005428#else
5429
5430static int
5431unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5432{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005434
5435 Py_UNICODE *s1 = str1->str;
5436 Py_UNICODE *s2 = str2->str;
5437
5438 len1 = str1->length;
5439 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005440
Marc-André Lemburge5034372000-08-08 08:04:29 +00005441 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005442 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005443
Fredrik Lundh45714e92001-06-26 16:39:36 +00005444 c1 = *s1++;
5445 c2 = *s2++;
5446
5447 if (c1 != c2)
5448 return (c1 < c2) ? -1 : 1;
5449
Marc-André Lemburge5034372000-08-08 08:04:29 +00005450 len1--; len2--;
5451 }
5452
5453 return (len1 < len2) ? -1 : (len1 != len2);
5454}
5455
5456#endif
5457
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458int PyUnicode_Compare(PyObject *left,
5459 PyObject *right)
5460{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005461 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5462 return unicode_compare((PyUnicodeObject *)left,
5463 (PyUnicodeObject *)right);
5464 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5465 (PyUnicode_Check(left) && PyString_Check(right))) {
5466 if (PyUnicode_Check(left))
5467 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5468 if (PyUnicode_Check(right))
5469 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5470 assert(PyString_Check(left));
5471 assert(PyString_Check(right));
5472 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005474 PyErr_Format(PyExc_TypeError,
5475 "Can't compare %.100s and %.100s",
5476 left->ob_type->tp_name,
5477 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 return -1;
5479}
5480
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005481PyObject *PyUnicode_RichCompare(PyObject *left,
5482 PyObject *right,
5483 int op)
5484{
5485 int result;
5486
5487 result = PyUnicode_Compare(left, right);
5488 if (result == -1 && PyErr_Occurred())
5489 goto onError;
5490
5491 /* Convert the return value to a Boolean */
5492 switch (op) {
5493 case Py_EQ:
5494 result = (result == 0);
5495 break;
5496 case Py_NE:
5497 result = (result != 0);
5498 break;
5499 case Py_LE:
5500 result = (result <= 0);
5501 break;
5502 case Py_GE:
5503 result = (result >= 0);
5504 break;
5505 case Py_LT:
5506 result = (result == -1);
5507 break;
5508 case Py_GT:
5509 result = (result == 1);
5510 break;
5511 }
5512 return PyBool_FromLong(result);
5513
5514 onError:
5515
5516 /* Standard case
5517
5518 Type errors mean that PyUnicode_FromObject() could not convert
5519 one of the arguments (usually the right hand side) to Unicode,
5520 ie. we can't handle the comparison request. However, it is
5521 possible that the other object knows a comparison method, which
5522 is why we return Py_NotImplemented to give the other object a
5523 chance.
5524
5525 */
5526 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5527 PyErr_Clear();
5528 Py_INCREF(Py_NotImplemented);
5529 return Py_NotImplemented;
5530 }
5531 if (op != Py_EQ && op != Py_NE)
5532 return NULL;
5533
5534 /* Equality comparison.
5535
5536 This is a special case: we silence any PyExc_UnicodeDecodeError
5537 and instead turn it into a PyErr_UnicodeWarning.
5538
5539 */
5540 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5541 return NULL;
5542 PyErr_Clear();
5543 if (PyErr_Warn(PyExc_UnicodeWarning,
5544 (op == Py_EQ) ?
5545 "Unicode equal comparison "
5546 "failed to convert both arguments to Unicode - "
5547 "interpreting them as being unequal" :
5548 "Unicode unequal comparison "
5549 "failed to convert both arguments to Unicode - "
5550 "interpreting them as being unequal"
5551 ) < 0)
5552 return NULL;
5553 result = (op == Py_NE);
5554 return PyBool_FromLong(result);
5555}
5556
Guido van Rossum403d68b2000-03-13 15:55:09 +00005557int PyUnicode_Contains(PyObject *container,
5558 PyObject *element)
5559{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005560 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005562
5563 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 sub = PyUnicode_FromObject(element);
5565 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005566 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005567 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005569 }
5570
Thomas Wouters477c8d52006-05-27 19:21:47 +00005571 str = PyUnicode_FromObject(container);
5572 if (!str) {
5573 Py_DECREF(sub);
5574 return -1;
5575 }
5576
5577 result = stringlib_contains_obj(str, sub);
5578
5579 Py_DECREF(str);
5580 Py_DECREF(sub);
5581
Guido van Rossum403d68b2000-03-13 15:55:09 +00005582 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005583}
5584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585/* Concat to string or Unicode object giving a new Unicode object. */
5586
5587PyObject *PyUnicode_Concat(PyObject *left,
5588 PyObject *right)
5589{
5590 PyUnicodeObject *u = NULL, *v = NULL, *w;
5591
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005592 if (PyBytes_Check(left) || PyBytes_Check(right))
5593 return PyBytes_Concat(left, right);
5594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 /* Coerce the two arguments */
5596 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5597 if (u == NULL)
5598 goto onError;
5599 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5600 if (v == NULL)
5601 goto onError;
5602
5603 /* Shortcuts */
5604 if (v == unicode_empty) {
5605 Py_DECREF(v);
5606 return (PyObject *)u;
5607 }
5608 if (u == unicode_empty) {
5609 Py_DECREF(u);
5610 return (PyObject *)v;
5611 }
5612
5613 /* Concat the two Unicode strings */
5614 w = _PyUnicode_New(u->length + v->length);
5615 if (w == NULL)
5616 goto onError;
5617 Py_UNICODE_COPY(w->str, u->str, u->length);
5618 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5619
5620 Py_DECREF(u);
5621 Py_DECREF(v);
5622 return (PyObject *)w;
5623
5624onError:
5625 Py_XDECREF(u);
5626 Py_XDECREF(v);
5627 return NULL;
5628}
5629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005630PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631"S.count(sub[, start[, end]]) -> int\n\
5632\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005633Return the number of non-overlapping occurrences of substring sub in\n\
5634Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
5637static PyObject *
5638unicode_count(PyUnicodeObject *self, PyObject *args)
5639{
5640 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005641 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005642 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 PyObject *result;
5644
Guido van Rossumb8872e62000-05-09 14:14:27 +00005645 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5646 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 return NULL;
5648
5649 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005650 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 if (substring == NULL)
5652 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005653
Thomas Wouters477c8d52006-05-27 19:21:47 +00005654 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
Thomas Wouters477c8d52006-05-27 19:21:47 +00005656 result = PyInt_FromSsize_t(
5657 stringlib_count(self->str + start, end - start,
5658 substring->str, substring->length)
5659 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
5661 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 return result;
5664}
5665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005666PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005667"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005669Encodes S using the codec registered for encoding. encoding defaults\n\
5670to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005671handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5673'xmlcharrefreplace' as well as any other name registered with\n\
5674codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
5676static PyObject *
5677unicode_encode(PyUnicodeObject *self, PyObject *args)
5678{
5679 char *encoding = NULL;
5680 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005681 PyObject *v;
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5684 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005685 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005686 if (v == NULL)
5687 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005688 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005689 if (PyString_Check(v)) {
5690 /* Old codec, turn it into bytes */
5691 PyObject *b = PyBytes_FromObject(v);
5692 Py_DECREF(v);
5693 return b;
5694 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005695 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005696 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005697 "(type=%.400s)",
5698 v->ob_type->tp_name);
5699 Py_DECREF(v);
5700 return NULL;
5701 }
5702 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005703
5704 onError:
5705 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005706}
5707
5708PyDoc_STRVAR(decode__doc__,
5709"S.decode([encoding[,errors]]) -> string or unicode\n\
5710\n\
5711Decodes S using the codec registered for encoding. encoding defaults\n\
5712to the default encoding. errors may be given to set a different error\n\
5713handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5714a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5715as well as any other name registerd with codecs.register_error that is\n\
5716able to handle UnicodeDecodeErrors.");
5717
5718static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005719unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005720{
5721 char *encoding = NULL;
5722 char *errors = NULL;
5723 PyObject *v;
5724
5725 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5726 return NULL;
5727 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005728 if (v == NULL)
5729 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005730 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5731 PyErr_Format(PyExc_TypeError,
5732 "decoder did not return a string/unicode object "
5733 "(type=%.400s)",
5734 v->ob_type->tp_name);
5735 Py_DECREF(v);
5736 return NULL;
5737 }
5738 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005739
5740 onError:
5741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742}
5743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005744PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745"S.expandtabs([tabsize]) -> unicode\n\
5746\n\
5747Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005748If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750static PyObject*
5751unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5752{
5753 Py_UNICODE *e;
5754 Py_UNICODE *p;
5755 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 PyUnicodeObject *u;
5758 int tabsize = 8;
5759
5760 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5761 return NULL;
5762
Thomas Wouters7e474022000-07-16 12:04:32 +00005763 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 i = j = 0;
5765 e = self->str + self->length;
5766 for (p = self->str; p < e; p++)
5767 if (*p == '\t') {
5768 if (tabsize > 0)
5769 j += tabsize - (j % tabsize);
5770 }
5771 else {
5772 j++;
5773 if (*p == '\n' || *p == '\r') {
5774 i += j;
5775 j = 0;
5776 }
5777 }
5778
5779 /* Second pass: create output string and fill it */
5780 u = _PyUnicode_New(i + j);
5781 if (!u)
5782 return NULL;
5783
5784 j = 0;
5785 q = u->str;
5786
5787 for (p = self->str; p < e; p++)
5788 if (*p == '\t') {
5789 if (tabsize > 0) {
5790 i = tabsize - (j % tabsize);
5791 j += i;
5792 while (i--)
5793 *q++ = ' ';
5794 }
5795 }
5796 else {
5797 j++;
5798 *q++ = *p;
5799 if (*p == '\n' || *p == '\r')
5800 j = 0;
5801 }
5802
5803 return (PyObject*) u;
5804}
5805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005806PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807"S.find(sub [,start [,end]]) -> int\n\
5808\n\
5809Return the lowest index in S where substring sub is found,\n\
5810such that sub is contained within s[start,end]. Optional\n\
5811arguments start and end are interpreted as in slice notation.\n\
5812\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005813Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
5815static PyObject *
5816unicode_find(PyUnicodeObject *self, PyObject *args)
5817{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005818 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005820 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005821 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
Guido van Rossumb8872e62000-05-09 14:14:27 +00005823 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5824 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005826 substring = PyUnicode_FromObject(substring);
5827 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 return NULL;
5829
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830 result = stringlib_find_slice(
5831 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5832 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5833 start, end
5834 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
5836 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005837
5838 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839}
5840
5841static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005842unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843{
5844 if (index < 0 || index >= self->length) {
5845 PyErr_SetString(PyExc_IndexError, "string index out of range");
5846 return NULL;
5847 }
5848
5849 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5850}
5851
5852static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005853unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005855 /* Since Unicode objects compare equal to their UTF-8 string
5856 counterparts, we hash the UTF-8 string. */
5857 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5858 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859}
5860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005861PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862"S.index(sub [,start [,end]]) -> int\n\
5863\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005864Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866static PyObject *
5867unicode_index(PyUnicodeObject *self, PyObject *args)
5868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005871 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005872 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Guido van Rossumb8872e62000-05-09 14:14:27 +00005874 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5875 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 substring = PyUnicode_FromObject(substring);
5878 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 return NULL;
5880
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881 result = stringlib_find_slice(
5882 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5883 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5884 start, end
5885 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 if (result < 0) {
5890 PyErr_SetString(PyExc_ValueError, "substring not found");
5891 return NULL;
5892 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005893
Martin v. Löwis18e16552006-02-15 17:27:45 +00005894 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895}
5896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005897PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005898"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005900Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005901at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
5903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005904unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
5906 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5907 register const Py_UNICODE *e;
5908 int cased;
5909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 /* Shortcut for single character strings */
5911 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005912 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005914 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005915 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 e = p + PyUnicode_GET_SIZE(self);
5919 cased = 0;
5920 for (; p < e; p++) {
5921 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 else if (!cased && Py_UNICODE_ISLOWER(ch))
5926 cased = 1;
5927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005928 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005931PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005932"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005934Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
5937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005938unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939{
5940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5941 register const Py_UNICODE *e;
5942 int cased;
5943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 /* Shortcut for single character strings */
5945 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005946 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005949 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005951
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 e = p + PyUnicode_GET_SIZE(self);
5953 cased = 0;
5954 for (; p < e; p++) {
5955 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005956
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005958 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 else if (!cased && Py_UNICODE_ISUPPER(ch))
5960 cased = 1;
5961 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005962 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963}
5964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005965PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005966"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005968Return True if S is a titlecased string and there is at least one\n\
5969character in S, i.e. upper- and titlecase characters may only\n\
5970follow uncased characters and lowercase characters only cased ones.\n\
5971Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
5973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005974unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975{
5976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5977 register const Py_UNICODE *e;
5978 int cased, previous_is_cased;
5979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* Shortcut for single character strings */
5981 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5983 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005985 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005986 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005987 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 e = p + PyUnicode_GET_SIZE(self);
5990 cased = 0;
5991 previous_is_cased = 0;
5992 for (; p < e; p++) {
5993 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005994
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5996 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005997 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 previous_is_cased = 1;
5999 cased = 1;
6000 }
6001 else if (Py_UNICODE_ISLOWER(ch)) {
6002 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006003 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 previous_is_cased = 1;
6005 cased = 1;
6006 }
6007 else
6008 previous_is_cased = 0;
6009 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006010 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011}
6012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006013PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006016Return True if all characters in S are whitespace\n\
6017and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018
6019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006020unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
6022 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6023 register const Py_UNICODE *e;
6024
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 /* Shortcut for single character strings */
6026 if (PyUnicode_GET_SIZE(self) == 1 &&
6027 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006028 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006030 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006031 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006033
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 e = p + PyUnicode_GET_SIZE(self);
6035 for (; p < e; p++) {
6036 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006039 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006042PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006043"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006044\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006045Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006046and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006047
6048static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006049unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006050{
6051 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6052 register const Py_UNICODE *e;
6053
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006054 /* Shortcut for single character strings */
6055 if (PyUnicode_GET_SIZE(self) == 1 &&
6056 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006057 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006058
6059 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006060 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062
6063 e = p + PyUnicode_GET_SIZE(self);
6064 for (; p < e; p++) {
6065 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006066 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006067 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006068 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006069}
6070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006072"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006073\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006074Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006075and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006076
6077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006078unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006079{
6080 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6081 register const Py_UNICODE *e;
6082
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006083 /* Shortcut for single character strings */
6084 if (PyUnicode_GET_SIZE(self) == 1 &&
6085 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006086 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006087
6088 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006089 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091
6092 e = p + PyUnicode_GET_SIZE(self);
6093 for (; p < e; p++) {
6094 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006095 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006096 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006097 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006098}
6099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006100PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006101"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006103Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006104False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105
6106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006107unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108{
6109 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6110 register const Py_UNICODE *e;
6111
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 /* Shortcut for single character strings */
6113 if (PyUnicode_GET_SIZE(self) == 1 &&
6114 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006115 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006117 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006118 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 e = p + PyUnicode_GET_SIZE(self);
6122 for (; p < e; p++) {
6123 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006124 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006126 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127}
6128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006129PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006130"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006132Return True if all characters in S are digits\n\
6133and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
6135static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006136unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
6138 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6139 register const Py_UNICODE *e;
6140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 /* Shortcut for single character strings */
6142 if (PyUnicode_GET_SIZE(self) == 1 &&
6143 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006144 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006146 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006147 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006149
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 e = p + PyUnicode_GET_SIZE(self);
6151 for (; p < e; p++) {
6152 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006153 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006155 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156}
6157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006158PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006159"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006161Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006162False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
6164static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006165unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
6167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6168 register const Py_UNICODE *e;
6169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 /* Shortcut for single character strings */
6171 if (PyUnicode_GET_SIZE(self) == 1 &&
6172 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006173 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006175 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006176 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006178
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 e = p + PyUnicode_GET_SIZE(self);
6180 for (; p < e; p++) {
6181 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006182 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006184 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006187PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188"S.join(sequence) -> unicode\n\
6189\n\
6190Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006191sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
6193static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006194unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006196 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197}
6198
Martin v. Löwis18e16552006-02-15 17:27:45 +00006199static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200unicode_length(PyUnicodeObject *self)
6201{
6202 return self->length;
6203}
6204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006206"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207\n\
6208Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006209done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211static PyObject *
6212unicode_ljust(PyUnicodeObject *self, PyObject *args)
6213{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006214 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006215 Py_UNICODE fillchar = ' ';
6216
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006217 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 return NULL;
6219
Tim Peters7a29bd52001-09-12 03:03:31 +00006220 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 Py_INCREF(self);
6222 return (PyObject*) self;
6223 }
6224
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006225 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006228PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229"S.lower() -> unicode\n\
6230\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006231Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
6233static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006234unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 return fixup(self, fixlower);
6237}
6238
/* Strip modes.  The values double as indices into stripformat[] below,
   so they must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the matching format string with its
   three-character "|O:" prefix skipped.  The argument is parenthesized
   so that any expression can be passed safely. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6247
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006248/* externally visible for str.strip(unicode) */
6249PyObject *
6250_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6251{
6252 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006253 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006254 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6256 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006257
Thomas Wouters477c8d52006-05-27 19:21:47 +00006258 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6259
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006260 i = 0;
6261 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6263 i++;
6264 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006265 }
6266
6267 j = len;
6268 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006269 do {
6270 j--;
6271 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6272 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006273 }
6274
6275 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006276 Py_INCREF(self);
6277 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278 }
6279 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006280 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006281}
6282
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
6284static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006285do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006287 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006288 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006289
6290 i = 0;
6291 if (striptype != RIGHTSTRIP) {
6292 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6293 i++;
6294 }
6295 }
6296
6297 j = len;
6298 if (striptype != LEFTSTRIP) {
6299 do {
6300 j--;
6301 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6302 j++;
6303 }
6304
6305 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6306 Py_INCREF(self);
6307 return (PyObject*)self;
6308 }
6309 else
6310 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311}
6312
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006313
6314static PyObject *
6315do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6316{
6317 PyObject *sep = NULL;
6318
6319 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6320 return NULL;
6321
6322 if (sep != NULL && sep != Py_None) {
6323 if (PyUnicode_Check(sep))
6324 return _PyUnicode_XStrip(self, striptype, sep);
6325 else if (PyString_Check(sep)) {
6326 PyObject *res;
6327 sep = PyUnicode_FromObject(sep);
6328 if (sep==NULL)
6329 return NULL;
6330 res = _PyUnicode_XStrip(self, striptype, sep);
6331 Py_DECREF(sep);
6332 return res;
6333 }
6334 else {
6335 PyErr_Format(PyExc_TypeError,
6336 "%s arg must be None, unicode or str",
6337 STRIPNAME(striptype));
6338 return NULL;
6339 }
6340 }
6341
6342 return do_strip(self, striptype);
6343}
6344
6345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006346PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006347"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006348\n\
6349Return a copy of the string S with leading and trailing\n\
6350whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006351If chars is given and not None, remove characters in chars instead.\n\
6352If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006353
6354static PyObject *
6355unicode_strip(PyUnicodeObject *self, PyObject *args)
6356{
6357 if (PyTuple_GET_SIZE(args) == 0)
6358 return do_strip(self, BOTHSTRIP); /* Common case */
6359 else
6360 return do_argstrip(self, BOTHSTRIP, args);
6361}
6362
6363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006364PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006365"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006366\n\
6367Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006368If chars is given and not None, remove characters in chars instead.\n\
6369If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006370
6371static PyObject *
6372unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6373{
6374 if (PyTuple_GET_SIZE(args) == 0)
6375 return do_strip(self, LEFTSTRIP); /* Common case */
6376 else
6377 return do_argstrip(self, LEFTSTRIP, args);
6378}
6379
6380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006381PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006382"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006383\n\
6384Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006385If chars is given and not None, remove characters in chars instead.\n\
6386If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006387
6388static PyObject *
6389unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6390{
6391 if (PyTuple_GET_SIZE(args) == 0)
6392 return do_strip(self, RIGHTSTRIP); /* Common case */
6393 else
6394 return do_argstrip(self, RIGHTSTRIP, args);
6395}
6396
6397
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006399unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
6401 PyUnicodeObject *u;
6402 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006404 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406 if (len < 0)
6407 len = 0;
6408
Tim Peters7a29bd52001-09-12 03:03:31 +00006409 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 /* no repeat, return original string */
6411 Py_INCREF(str);
6412 return (PyObject*) str;
6413 }
Tim Peters8f422462000-09-09 06:13:41 +00006414
6415 /* ensure # of chars needed doesn't overflow int and # of bytes
6416 * needed doesn't overflow size_t
6417 */
6418 nchars = len * str->length;
6419 if (len && nchars / len != str->length) {
6420 PyErr_SetString(PyExc_OverflowError,
6421 "repeated string is too long");
6422 return NULL;
6423 }
6424 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6425 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6426 PyErr_SetString(PyExc_OverflowError,
6427 "repeated string is too long");
6428 return NULL;
6429 }
6430 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 if (!u)
6432 return NULL;
6433
6434 p = u->str;
6435
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 if (str->length == 1 && len > 0) {
6437 Py_UNICODE_FILL(p, str->str[0], len);
6438 } else {
6439 Py_ssize_t done = 0; /* number of characters copied this far */
6440 if (done < nchars) {
6441 Py_UNICODE_COPY(p, str->str, str->length);
6442 done = str->length;
6443 }
6444 while (done < nchars) {
6445 int n = (done <= nchars-done) ? done : nchars-done;
6446 Py_UNICODE_COPY(p+done, p, n);
6447 done += n;
6448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 }
6450
6451 return (PyObject*) u;
6452}
6453
6454PyObject *PyUnicode_Replace(PyObject *obj,
6455 PyObject *subobj,
6456 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458{
6459 PyObject *self;
6460 PyObject *str1;
6461 PyObject *str2;
6462 PyObject *result;
6463
6464 self = PyUnicode_FromObject(obj);
6465 if (self == NULL)
6466 return NULL;
6467 str1 = PyUnicode_FromObject(subobj);
6468 if (str1 == NULL) {
6469 Py_DECREF(self);
6470 return NULL;
6471 }
6472 str2 = PyUnicode_FromObject(replobj);
6473 if (str2 == NULL) {
6474 Py_DECREF(self);
6475 Py_DECREF(str1);
6476 return NULL;
6477 }
Tim Petersced69f82003-09-16 20:30:58 +00006478 result = replace((PyUnicodeObject *)self,
6479 (PyUnicodeObject *)str1,
6480 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 maxcount);
6482 Py_DECREF(self);
6483 Py_DECREF(str1);
6484 Py_DECREF(str2);
6485 return result;
6486}
6487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489"S.replace (old, new[, maxsplit]) -> unicode\n\
6490\n\
6491Return a copy of S with all occurrences of substring\n\
6492old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006493given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
6495static PyObject*
6496unicode_replace(PyUnicodeObject *self, PyObject *args)
6497{
6498 PyUnicodeObject *str1;
6499 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006500 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 PyObject *result;
6502
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return NULL;
6505 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6506 if (str1 == NULL)
6507 return NULL;
6508 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006509 if (str2 == NULL) {
6510 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514 result = replace(self, str1, str2, maxcount);
6515
6516 Py_DECREF(str1);
6517 Py_DECREF(str2);
6518 return result;
6519}
6520
6521static
6522PyObject *unicode_repr(PyObject *unicode)
6523{
6524 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6525 PyUnicode_GET_SIZE(unicode),
6526 1);
6527}
6528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006529PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530"S.rfind(sub [,start [,end]]) -> int\n\
6531\n\
6532Return the highest index in S where substring sub is found,\n\
6533such that sub is contained within s[start,end]. Optional\n\
6534arguments start and end are interpreted as in slice notation.\n\
6535\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006536Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
6538static PyObject *
6539unicode_rfind(PyUnicodeObject *self, PyObject *args)
6540{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006541 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006542 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006543 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006544 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
Guido van Rossumb8872e62000-05-09 14:14:27 +00006546 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6547 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006549 substring = PyUnicode_FromObject(substring);
6550 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return NULL;
6552
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553 result = stringlib_rfind_slice(
6554 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6555 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6556 start, end
6557 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560
6561 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565"S.rindex(sub [,start [,end]]) -> int\n\
6566\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569static PyObject *
6570unicode_rindex(PyUnicodeObject *self, PyObject *args)
6571{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006574 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Guido van Rossumb8872e62000-05-09 14:14:27 +00006577 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 substring = PyUnicode_FromObject(substring);
6581 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return NULL;
6583
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 result = stringlib_rfind_slice(
6585 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6586 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6587 start, end
6588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 if (result < 0) {
6593 PyErr_SetString(PyExc_ValueError, "substring not found");
6594 return NULL;
6595 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006596 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597}
6598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006600"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601\n\
6602Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006603done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
6605static PyObject *
6606unicode_rjust(PyUnicodeObject *self, PyObject *args)
6607{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006608 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006609 Py_UNICODE fillchar = ' ';
6610
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006611 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 return NULL;
6613
Tim Peters7a29bd52001-09-12 03:03:31 +00006614 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 Py_INCREF(self);
6616 return (PyObject*) self;
6617 }
6618
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006619 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620}
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
6625 /* standard clamping */
6626 if (start < 0)
6627 start = 0;
6628 if (end < 0)
6629 end = 0;
6630 if (end > self->length)
6631 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006632 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 /* full slice, return original string */
6634 Py_INCREF(self);
6635 return (PyObject*) self;
6636 }
6637 if (start > end)
6638 start = end;
6639 /* copy slice */
6640 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6641 end - start);
6642}
6643
6644PyObject *PyUnicode_Split(PyObject *s,
6645 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
6648 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 s = PyUnicode_FromObject(s);
6651 if (s == NULL)
6652 return NULL;
6653 if (sep != NULL) {
6654 sep = PyUnicode_FromObject(sep);
6655 if (sep == NULL) {
6656 Py_DECREF(s);
6657 return NULL;
6658 }
6659 }
6660
6661 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6662
6663 Py_DECREF(s);
6664 Py_XDECREF(sep);
6665 return result;
6666}
6667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669"S.split([sep [,maxsplit]]) -> list of strings\n\
6670\n\
6671Return a list of the words in S, using sep as the\n\
6672delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006673splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006674any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
6676static PyObject*
6677unicode_split(PyUnicodeObject *self, PyObject *args)
6678{
6679 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006680 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
Martin v. Löwis18e16552006-02-15 17:27:45 +00006682 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 return NULL;
6684
6685 if (substring == Py_None)
6686 return split(self, NULL, maxcount);
6687 else if (PyUnicode_Check(substring))
6688 return split(self, (PyUnicodeObject *)substring, maxcount);
6689 else
6690 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6691}
6692
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693PyObject *
6694PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6695{
6696 PyObject* str_obj;
6697 PyObject* sep_obj;
6698 PyObject* out;
6699
6700 str_obj = PyUnicode_FromObject(str_in);
6701 if (!str_obj)
6702 return NULL;
6703 sep_obj = PyUnicode_FromObject(sep_in);
6704 if (!sep_obj) {
6705 Py_DECREF(str_obj);
6706 return NULL;
6707 }
6708
6709 out = stringlib_partition(
6710 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6711 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6712 );
6713
6714 Py_DECREF(sep_obj);
6715 Py_DECREF(str_obj);
6716
6717 return out;
6718}
6719
6720
6721PyObject *
6722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6723{
6724 PyObject* str_obj;
6725 PyObject* sep_obj;
6726 PyObject* out;
6727
6728 str_obj = PyUnicode_FromObject(str_in);
6729 if (!str_obj)
6730 return NULL;
6731 sep_obj = PyUnicode_FromObject(sep_in);
6732 if (!sep_obj) {
6733 Py_DECREF(str_obj);
6734 return NULL;
6735 }
6736
6737 out = stringlib_rpartition(
6738 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6739 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6740 );
6741
6742 Py_DECREF(sep_obj);
6743 Py_DECREF(str_obj);
6744
6745 return out;
6746}
6747
6748PyDoc_STRVAR(partition__doc__,
6749"S.partition(sep) -> (head, sep, tail)\n\
6750\n\
6751Searches for the separator sep in S, and returns the part before it,\n\
6752the separator itself, and the part after it. If the separator is not\n\
6753found, returns S and two empty strings.");
6754
6755static PyObject*
6756unicode_partition(PyUnicodeObject *self, PyObject *separator)
6757{
6758 return PyUnicode_Partition((PyObject *)self, separator);
6759}
6760
6761PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006762"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006763\n\
6764Searches for the separator sep in S, starting at the end of S, and returns\n\
6765the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006766separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006767
6768static PyObject*
6769unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6770{
6771 return PyUnicode_RPartition((PyObject *)self, separator);
6772}
6773
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006774PyObject *PyUnicode_RSplit(PyObject *s,
6775 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006776 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006777{
6778 PyObject *result;
6779
6780 s = PyUnicode_FromObject(s);
6781 if (s == NULL)
6782 return NULL;
6783 if (sep != NULL) {
6784 sep = PyUnicode_FromObject(sep);
6785 if (sep == NULL) {
6786 Py_DECREF(s);
6787 return NULL;
6788 }
6789 }
6790
6791 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6792
6793 Py_DECREF(s);
6794 Py_XDECREF(sep);
6795 return result;
6796}
6797
6798PyDoc_STRVAR(rsplit__doc__,
6799"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6800\n\
6801Return a list of the words in S, using sep as the\n\
6802delimiter string, starting at the end of the string and\n\
6803working to the front. If maxsplit is given, at most maxsplit\n\
6804splits are done. If sep is not specified, any whitespace string\n\
6805is a separator.");
6806
6807static PyObject*
6808unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6809{
6810 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006811 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006812
Martin v. Löwis18e16552006-02-15 17:27:45 +00006813 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006814 return NULL;
6815
6816 if (substring == Py_None)
6817 return rsplit(self, NULL, maxcount);
6818 else if (PyUnicode_Check(substring))
6819 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6820 else
6821 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6822}
6823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006825"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826\n\
6827Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006828Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject*
6832unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6833{
Guido van Rossum86662912000-04-11 15:38:46 +00006834 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Guido van Rossum86662912000-04-11 15:38:46 +00006836 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 return NULL;
6838
Guido van Rossum86662912000-04-11 15:38:46 +00006839 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840}
6841
6842static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006843PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006845 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6846 Py_XINCREF(res);
6847 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006850PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851"S.swapcase() -> unicode\n\
6852\n\
6853Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855
6856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006857unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 return fixup(self, fixswapcase);
6860}
6861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863"S.translate(table) -> unicode\n\
6864\n\
6865Return a copy of the string S, where all characters have been mapped\n\
6866through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006867Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6868Unmapped characters are left untouched. Characters mapped to None\n\
6869are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
6871static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006872unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
Tim Petersced69f82003-09-16 20:30:58 +00006874 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006876 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 "ignore");
6878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881"S.upper() -> unicode\n\
6882\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
6885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006886unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 return fixup(self, fixupper);
6889}
6890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006891PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892"S.zfill(width) -> unicode\n\
6893\n\
6894Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006895of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897static PyObject *
6898unicode_zfill(PyUnicodeObject *self, PyObject *args)
6899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 PyUnicodeObject *u;
6902
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t width;
6904 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return NULL;
6906
6907 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006908 if (PyUnicode_CheckExact(self)) {
6909 Py_INCREF(self);
6910 return (PyObject*) self;
6911 }
6912 else
6913 return PyUnicode_FromUnicode(
6914 PyUnicode_AS_UNICODE(self),
6915 PyUnicode_GET_SIZE(self)
6916 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 }
6918
6919 fill = width - self->length;
6920
6921 u = pad(self, fill, 0, '0');
6922
Walter Dörwald068325e2002-04-15 13:36:47 +00006923 if (u == NULL)
6924 return NULL;
6925
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 if (u->str[fill] == '+' || u->str[fill] == '-') {
6927 /* move sign to beginning of string */
6928 u->str[0] = u->str[fill];
6929 u->str[fill] = '0';
6930 }
6931
6932 return (PyObject*) u;
6933}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
#if 0
/* Compiled-out debugging helper: returns unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long current_size = unicode_freelist_size;

    return PyInt_FromLong(current_size);
}
#endif
6942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006946Return True if S starts with the specified prefix, False otherwise.\n\
6947With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948With optional end, stop comparing S at that position.\n\
6949prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950
6951static PyObject *
6952unicode_startswith(PyUnicodeObject *self,
6953 PyObject *args)
6954{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006957 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006958 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006962 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964 if (PyTuple_Check(subobj)) {
6965 Py_ssize_t i;
6966 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6967 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6968 PyTuple_GET_ITEM(subobj, i));
6969 if (substring == NULL)
6970 return NULL;
6971 result = tailmatch(self, substring, start, end, -1);
6972 Py_DECREF(substring);
6973 if (result) {
6974 Py_RETURN_TRUE;
6975 }
6976 }
6977 /* nothing matched */
6978 Py_RETURN_FALSE;
6979 }
6980 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 return NULL;
6983 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006985 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986}
6987
6988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006989PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006990"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006992Return True if S ends with the specified suffix, False otherwise.\n\
6993With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994With optional end, stop comparing S at that position.\n\
6995suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject *
6998unicode_endswith(PyUnicodeObject *self,
6999 PyObject *args)
7000{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007004 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7008 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010 if (PyTuple_Check(subobj)) {
7011 Py_ssize_t i;
7012 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7013 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7014 PyTuple_GET_ITEM(subobj, i));
7015 if (substring == NULL)
7016 return NULL;
7017 result = tailmatch(self, substring, start, end, +1);
7018 Py_DECREF(substring);
7019 if (result) {
7020 Py_RETURN_TRUE;
7021 }
7022 }
7023 Py_RETURN_FALSE;
7024 }
7025 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032}
7033
7034
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007035
7036static PyObject *
7037unicode_getnewargs(PyUnicodeObject *v)
7038{
7039 return Py_BuildValue("(u#)", v->str, v->length);
7040}
7041
7042
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043static PyMethodDef unicode_methods[] = {
7044
7045 /* Order is according to common usage: often used methods should
7046 appear first, since lookup is done sequentially. */
7047
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007048 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7049 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7050 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007051 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007052 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7053 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7054 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7055 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7056 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7057 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7058 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007059 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007060 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7061 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7062 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007064 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007065/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7066 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7067 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7068 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007070 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007071 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007072 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007073 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7074 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7075 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7076 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7077 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7078 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7079 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7080 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7081 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7082 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7083 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7084 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7085 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7086 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007087 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007088#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007089 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090#endif
7091
7092#if 0
7093 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007094 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095#endif
7096
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007097 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 {NULL, NULL}
7099};
7100
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007101static PyObject *
7102unicode_mod(PyObject *v, PyObject *w)
7103{
7104 if (!PyUnicode_Check(v)) {
7105 Py_INCREF(Py_NotImplemented);
7106 return Py_NotImplemented;
7107 }
7108 return PyUnicode_Format(v, w);
7109}
7110
7111static PyNumberMethods unicode_as_number = {
7112 0, /*nb_add*/
7113 0, /*nb_subtract*/
7114 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007115 unicode_mod, /*nb_remainder*/
7116};
7117
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007119 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007120 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007121 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7122 (ssizeargfunc) unicode_getitem, /* sq_item */
7123 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 0, /* sq_ass_item */
7125 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007126 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127};
7128
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007129static PyObject*
7130unicode_subscript(PyUnicodeObject* self, PyObject* item)
7131{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007132 if (PyIndex_Check(item)) {
7133 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007134 if (i == -1 && PyErr_Occurred())
7135 return NULL;
7136 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007137 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007138 return unicode_getitem(self, i);
7139 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007140 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007141 Py_UNICODE* source_buf;
7142 Py_UNICODE* result_buf;
7143 PyObject* result;
7144
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007145 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007146 &start, &stop, &step, &slicelength) < 0) {
7147 return NULL;
7148 }
7149
7150 if (slicelength <= 0) {
7151 return PyUnicode_FromUnicode(NULL, 0);
7152 } else {
7153 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007154 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7155 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007156
7157 if (result_buf == NULL)
7158 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007159
7160 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7161 result_buf[i] = source_buf[cur];
7162 }
Tim Petersced69f82003-09-16 20:30:58 +00007163
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007164 result = PyUnicode_FromUnicode(result_buf, slicelength);
7165 PyMem_FREE(result_buf);
7166 return result;
7167 }
7168 } else {
7169 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7170 return NULL;
7171 }
7172}
7173
7174static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007175 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007176 (binaryfunc)unicode_subscript, /* mp_subscript */
7177 (objobjargproc)0, /* mp_ass_subscript */
7178};
7179
Martin v. Löwis18e16552006-02-15 17:27:45 +00007180static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 const void **ptr)
7184{
7185 if (index != 0) {
7186 PyErr_SetString(PyExc_SystemError,
7187 "accessing non-existent unicode segment");
7188 return -1;
7189 }
7190 *ptr = (void *) self->str;
7191 return PyUnicode_GET_DATA_SIZE(self);
7192}
7193
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194static Py_ssize_t
7195unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 const void **ptr)
7197{
7198 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007199 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 return -1;
7201}
7202
7203static int
7204unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206{
7207 if (lenp)
7208 *lenp = PyUnicode_GET_DATA_SIZE(self);
7209 return 1;
7210}
7211
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007212static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 const void **ptr)
7216{
7217 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007218
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 if (index != 0) {
7220 PyErr_SetString(PyExc_SystemError,
7221 "accessing non-existent unicode segment");
7222 return -1;
7223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007224 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 if (str == NULL)
7226 return -1;
7227 *ptr = (void *) PyString_AS_STRING(str);
7228 return PyString_GET_SIZE(str);
7229}
7230
/* Helpers for PyUnicode_Format() */
7232
7233static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007234getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007236 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 if (argidx < arglen) {
7238 (*p_argidx)++;
7239 if (arglen < 0)
7240 return args;
7241 else
7242 return PyTuple_GetItem(args, argidx);
7243 }
7244 PyErr_SetString(PyExc_TypeError,
7245 "not enough arguments for format string");
7246 return NULL;
7247}
7248
/* Bit flags collected while parsing a format specifier; the character
   noted beside each one is the flag character that sets it in
   PyUnicode_Format(). */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
7254
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007256strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 register Py_ssize_t i;
7259 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 for (i = len - 1; i >= 0; i--)
7261 buffer[i] = (Py_UNICODE) charbuffer[i];
7262
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 return len;
7264}
7265
Neal Norwitzfc76d632006-01-10 06:03:13 +00007266static int
7267doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7268{
Tim Peters15231542006-02-16 01:08:01 +00007269 Py_ssize_t result;
7270
Neal Norwitzfc76d632006-01-10 06:03:13 +00007271 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007272 result = strtounicode(buffer, (char *)buffer);
7273 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007274}
7275
7276static int
7277longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7278{
Tim Peters15231542006-02-16 01:08:01 +00007279 Py_ssize_t result;
7280
Neal Norwitzfc76d632006-01-10 06:03:13 +00007281 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007282 result = strtounicode(buffer, (char *)buffer);
7283 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007284}
7285
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
7289
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290static int
7291formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007292 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 int flags,
7294 int prec,
7295 int type,
7296 PyObject *v)
7297{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007298 /* fmt = '%#.' + `prec` + `type`
7299 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 char fmt[20];
7301 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007302
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 x = PyFloat_AsDouble(v);
7304 if (x == -1.0 && PyErr_Occurred())
7305 return -1;
7306 if (prec < 0)
7307 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7309 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007310 /* Worst case length calc to ensure no buffer overrun:
7311
7312 'g' formats:
7313 fmt = %#.<prec>g
7314 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7315 for any double rep.)
7316 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7317
7318 'f' formats:
7319 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7320 len = 1 + 50 + 1 + prec = 52 + prec
7321
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007322 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007323 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007324
7325 */
7326 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7327 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007328 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007329 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007330 return -1;
7331 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007332 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7333 (flags&F_ALT) ? "#" : "",
7334 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007335 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336}
7337
Tim Peters38fd5b62000-09-21 05:43:11 +00007338static PyObject*
7339formatlong(PyObject *val, int flags, int prec, int type)
7340{
7341 char *buf;
7342 int i, len;
7343 PyObject *str; /* temporary string object. */
7344 PyUnicodeObject *result;
7345
7346 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7347 if (!str)
7348 return NULL;
7349 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007350 if (!result) {
7351 Py_DECREF(str);
7352 return NULL;
7353 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007354 for (i = 0; i < len; i++)
7355 result->str[i] = buf[i];
7356 result->str[len] = 0;
7357 Py_DECREF(str);
7358 return (PyObject*)result;
7359}
7360
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361static int
7362formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007363 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 int flags,
7365 int prec,
7366 int type,
7367 PyObject *v)
7368{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007369 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007370 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7371 * + 1 + 1
7372 * = 24
7373 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007374 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007375 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 long x;
7377
7378 x = PyInt_AsLong(v);
7379 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007380 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007381 if (x < 0 && type == 'u') {
7382 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007383 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007384 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7385 sign = "-";
7386 else
7387 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007389 prec = 1;
7390
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007391 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7392 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007393 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007394 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007395 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007396 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007397 return -1;
7398 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007399
7400 if ((flags & F_ALT) &&
7401 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007402 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007403 * of issues that cause pain:
7404 * - when 0 is being converted, the C standard leaves off
7405 * the '0x' or '0X', which is inconsistent with other
7406 * %#x/%#X conversions and inconsistent with Python's
7407 * hex() function
7408 * - there are platforms that violate the standard and
7409 * convert 0 with the '0x' or '0X'
7410 * (Metrowerks, Compaq Tru64)
7411 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007412 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007413 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007414 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007415 * We can achieve the desired consistency by inserting our
7416 * own '0x' or '0X' prefix, and substituting %x/%X in place
7417 * of %#x/%#X.
7418 *
7419 * Note that this is the same approach as used in
7420 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007421 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007422 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7423 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007424 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007425 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007426 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7427 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007428 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007429 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007430 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007431 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007432 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007433 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434}
7435
7436static int
7437formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007438 size_t buflen,
7439 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007441 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007442 if (PyUnicode_Check(v)) {
7443 if (PyUnicode_GET_SIZE(v) != 1)
7444 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007448 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007449 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007450 goto onError;
7451 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
7454 else {
7455 /* Integer input truncated to a character */
7456 long x;
7457 x = PyInt_AsLong(v);
7458 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007459 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007460#ifdef Py_UNICODE_WIDE
7461 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007462 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007463 "%c arg not in range(0x110000) "
7464 "(wide Python build)");
7465 return -1;
7466 }
7467#else
7468 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007469 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007470 "%c arg not in range(0x10000) "
7471 "(narrow Python build)");
7472 return -1;
7473 }
7474#endif
7475 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 }
7477 buf[1] = '\0';
7478 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007479
7480 onError:
7481 PyErr_SetString(PyExc_TypeError,
7482 "%c requires int or char");
7483 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484}
7485
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does its own bounds checking to ensure
   no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
7495
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496PyObject *PyUnicode_Format(PyObject *format,
7497 PyObject *args)
7498{
7499 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 int args_owned = 0;
7502 PyUnicodeObject *result = NULL;
7503 PyObject *dict = NULL;
7504 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007505
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 if (format == NULL || args == NULL) {
7507 PyErr_BadInternalCall();
7508 return NULL;
7509 }
7510 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007511 if (uformat == NULL)
7512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 fmt = PyUnicode_AS_UNICODE(uformat);
7514 fmtcnt = PyUnicode_GET_SIZE(uformat);
7515
7516 reslen = rescnt = fmtcnt + 100;
7517 result = _PyUnicode_New(reslen);
7518 if (result == NULL)
7519 goto onError;
7520 res = PyUnicode_AS_UNICODE(result);
7521
7522 if (PyTuple_Check(args)) {
7523 arglen = PyTuple_Size(args);
7524 argidx = 0;
7525 }
7526 else {
7527 arglen = -1;
7528 argidx = -2;
7529 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007530 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7531 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 dict = args;
7533
7534 while (--fmtcnt >= 0) {
7535 if (*fmt != '%') {
7536 if (--rescnt < 0) {
7537 rescnt = fmtcnt + 100;
7538 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007539 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7542 --rescnt;
7543 }
7544 *res++ = *fmt++;
7545 }
7546 else {
7547 /* Got a format specifier */
7548 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 Py_UNICODE c = '\0';
7552 Py_UNICODE fill;
7553 PyObject *v = NULL;
7554 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007555 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007557 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007558 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
7560 fmt++;
7561 if (*fmt == '(') {
7562 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 PyObject *key;
7565 int pcount = 1;
7566
7567 if (dict == NULL) {
7568 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007569 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 goto onError;
7571 }
7572 ++fmt;
7573 --fmtcnt;
7574 keystart = fmt;
7575 /* Skip over balanced parentheses */
7576 while (pcount > 0 && --fmtcnt >= 0) {
7577 if (*fmt == ')')
7578 --pcount;
7579 else if (*fmt == '(')
7580 ++pcount;
7581 fmt++;
7582 }
7583 keylen = fmt - keystart - 1;
7584 if (fmtcnt < 0 || pcount > 0) {
7585 PyErr_SetString(PyExc_ValueError,
7586 "incomplete format key");
7587 goto onError;
7588 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007589#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007590 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 then looked up since Python uses strings to hold
7592 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007593 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 key = PyUnicode_EncodeUTF8(keystart,
7595 keylen,
7596 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007597#else
7598 key = PyUnicode_FromUnicode(keystart, keylen);
7599#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 if (key == NULL)
7601 goto onError;
7602 if (args_owned) {
7603 Py_DECREF(args);
7604 args_owned = 0;
7605 }
7606 args = PyObject_GetItem(dict, key);
7607 Py_DECREF(key);
7608 if (args == NULL) {
7609 goto onError;
7610 }
7611 args_owned = 1;
7612 arglen = -1;
7613 argidx = -2;
7614 }
7615 while (--fmtcnt >= 0) {
7616 switch (c = *fmt++) {
7617 case '-': flags |= F_LJUST; continue;
7618 case '+': flags |= F_SIGN; continue;
7619 case ' ': flags |= F_BLANK; continue;
7620 case '#': flags |= F_ALT; continue;
7621 case '0': flags |= F_ZERO; continue;
7622 }
7623 break;
7624 }
7625 if (c == '*') {
7626 v = getnextarg(args, arglen, &argidx);
7627 if (v == NULL)
7628 goto onError;
7629 if (!PyInt_Check(v)) {
7630 PyErr_SetString(PyExc_TypeError,
7631 "* wants int");
7632 goto onError;
7633 }
7634 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007635 if (width == -1 && PyErr_Occurred())
7636 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 if (width < 0) {
7638 flags |= F_LJUST;
7639 width = -width;
7640 }
7641 if (--fmtcnt >= 0)
7642 c = *fmt++;
7643 }
7644 else if (c >= '0' && c <= '9') {
7645 width = c - '0';
7646 while (--fmtcnt >= 0) {
7647 c = *fmt++;
7648 if (c < '0' || c > '9')
7649 break;
7650 if ((width*10) / 10 != width) {
7651 PyErr_SetString(PyExc_ValueError,
7652 "width too big");
7653 goto onError;
7654 }
7655 width = width*10 + (c - '0');
7656 }
7657 }
7658 if (c == '.') {
7659 prec = 0;
7660 if (--fmtcnt >= 0)
7661 c = *fmt++;
7662 if (c == '*') {
7663 v = getnextarg(args, arglen, &argidx);
7664 if (v == NULL)
7665 goto onError;
7666 if (!PyInt_Check(v)) {
7667 PyErr_SetString(PyExc_TypeError,
7668 "* wants int");
7669 goto onError;
7670 }
7671 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007672 if (prec == -1 && PyErr_Occurred())
7673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 if (prec < 0)
7675 prec = 0;
7676 if (--fmtcnt >= 0)
7677 c = *fmt++;
7678 }
7679 else if (c >= '0' && c <= '9') {
7680 prec = c - '0';
7681 while (--fmtcnt >= 0) {
7682 c = Py_CHARMASK(*fmt++);
7683 if (c < '0' || c > '9')
7684 break;
7685 if ((prec*10) / 10 != prec) {
7686 PyErr_SetString(PyExc_ValueError,
7687 "prec too big");
7688 goto onError;
7689 }
7690 prec = prec*10 + (c - '0');
7691 }
7692 }
7693 } /* prec */
7694 if (fmtcnt >= 0) {
7695 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 if (--fmtcnt >= 0)
7697 c = *fmt++;
7698 }
7699 }
7700 if (fmtcnt < 0) {
7701 PyErr_SetString(PyExc_ValueError,
7702 "incomplete format");
7703 goto onError;
7704 }
7705 if (c != '%') {
7706 v = getnextarg(args, arglen, &argidx);
7707 if (v == NULL)
7708 goto onError;
7709 }
7710 sign = 0;
7711 fill = ' ';
7712 switch (c) {
7713
7714 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007715 pbuf = formatbuf;
7716 /* presume that buffer length is at least 1 */
7717 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 len = 1;
7719 break;
7720
7721 case 's':
7722 case 'r':
7723 if (PyUnicode_Check(v) && c == 's') {
7724 temp = v;
7725 Py_INCREF(temp);
7726 }
7727 else {
7728 PyObject *unicode;
7729 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007730 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 else
7732 temp = PyObject_Repr(v);
7733 if (temp == NULL)
7734 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007735 if (PyUnicode_Check(temp))
7736 /* nothing to do */;
7737 else if (PyString_Check(temp)) {
7738 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007739 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007741 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007743 Py_DECREF(temp);
7744 temp = unicode;
7745 if (temp == NULL)
7746 goto onError;
7747 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007748 else {
7749 Py_DECREF(temp);
7750 PyErr_SetString(PyExc_TypeError,
7751 "%s argument has non-string str()");
7752 goto onError;
7753 }
7754 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007755 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 len = PyUnicode_GET_SIZE(temp);
7757 if (prec >= 0 && len > prec)
7758 len = prec;
7759 break;
7760
7761 case 'i':
7762 case 'd':
7763 case 'u':
7764 case 'o':
7765 case 'x':
7766 case 'X':
7767 if (c == 'i')
7768 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007769 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007770 temp = formatlong(v, flags, prec, c);
7771 if (!temp)
7772 goto onError;
7773 pbuf = PyUnicode_AS_UNICODE(temp);
7774 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007775 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007777 else {
7778 pbuf = formatbuf;
7779 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7780 flags, prec, c, v);
7781 if (len < 0)
7782 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007783 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007784 }
7785 if (flags & F_ZERO)
7786 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 break;
7788
7789 case 'e':
7790 case 'E':
7791 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007792 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 case 'g':
7794 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007795 if (c == 'F')
7796 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007797 pbuf = formatbuf;
7798 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7799 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 if (len < 0)
7801 goto onError;
7802 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007803 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 fill = '0';
7805 break;
7806
7807 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007808 pbuf = formatbuf;
7809 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 if (len < 0)
7811 goto onError;
7812 break;
7813
7814 default:
7815 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007816 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007817 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007818 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007819 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007820 (Py_ssize_t)(fmt - 1 -
7821 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 goto onError;
7823 }
7824 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007825 if (*pbuf == '-' || *pbuf == '+') {
7826 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 len--;
7828 }
7829 else if (flags & F_SIGN)
7830 sign = '+';
7831 else if (flags & F_BLANK)
7832 sign = ' ';
7833 else
7834 sign = 0;
7835 }
7836 if (width < len)
7837 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007838 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 reslen -= rescnt;
7840 rescnt = width + fmtcnt + 100;
7841 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007842 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007843 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007844 PyErr_NoMemory();
7845 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007846 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007847 if (_PyUnicode_Resize(&result, reslen) < 0) {
7848 Py_XDECREF(temp);
7849 goto onError;
7850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 res = PyUnicode_AS_UNICODE(result)
7852 + reslen - rescnt;
7853 }
7854 if (sign) {
7855 if (fill != ' ')
7856 *res++ = sign;
7857 rescnt--;
7858 if (width > len)
7859 width--;
7860 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007861 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7862 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007863 assert(pbuf[1] == c);
7864 if (fill != ' ') {
7865 *res++ = *pbuf++;
7866 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007867 }
Tim Petersfff53252001-04-12 18:38:48 +00007868 rescnt -= 2;
7869 width -= 2;
7870 if (width < 0)
7871 width = 0;
7872 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 if (width > len && !(flags & F_LJUST)) {
7875 do {
7876 --rescnt;
7877 *res++ = fill;
7878 } while (--width > len);
7879 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007880 if (fill == ' ') {
7881 if (sign)
7882 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007883 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007884 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007885 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007886 *res++ = *pbuf++;
7887 *res++ = *pbuf++;
7888 }
7889 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007890 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 res += len;
7892 rescnt -= len;
7893 while (--width >= len) {
7894 --rescnt;
7895 *res++ = ' ';
7896 }
7897 if (dict && (argidx < arglen) && c != '%') {
7898 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007899 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007900 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 goto onError;
7902 }
7903 Py_XDECREF(temp);
7904 } /* '%' */
7905 } /* until end */
7906 if (argidx < arglen && !dict) {
7907 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007908 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 goto onError;
7910 }
7911
Thomas Woutersa96affe2006-03-12 00:29:36 +00007912 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7913 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 if (args_owned) {
7915 Py_DECREF(args);
7916 }
7917 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 return (PyObject *)result;
7919
7920 onError:
7921 Py_XDECREF(result);
7922 Py_DECREF(uformat);
7923 if (args_owned) {
7924 Py_DECREF(args);
7925 }
7926 return NULL;
7927}
7928
7929static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007930 (readbufferproc) unicode_buffer_getreadbuf,
7931 (writebufferproc) unicode_buffer_getwritebuf,
7932 (segcountproc) unicode_buffer_getsegcount,
7933 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934};
7935
Jeremy Hylton938ace62002-07-17 16:30:39 +00007936static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007937unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7938
Tim Peters6d6c1a32001-08-02 04:15:00 +00007939static PyObject *
7940unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7941{
7942 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007943 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007944 char *encoding = NULL;
7945 char *errors = NULL;
7946
Guido van Rossume023fe02001-08-30 03:12:59 +00007947 if (type != &PyUnicode_Type)
7948 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007949 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7950 kwlist, &x, &encoding, &errors))
7951 return NULL;
7952 if (x == NULL)
7953 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007954 if (encoding == NULL && errors == NULL)
7955 return PyObject_Unicode(x);
7956 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007957 return PyUnicode_FromEncodedObject(x, encoding, errors);
7958}
7959
Guido van Rossume023fe02001-08-30 03:12:59 +00007960static PyObject *
7961unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7962{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007963 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007964 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007965
7966 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7967 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7968 if (tmp == NULL)
7969 return NULL;
7970 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007971 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007972 if (pnew == NULL) {
7973 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007974 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007975 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007976 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7977 if (pnew->str == NULL) {
7978 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007979 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007980 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007981 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007982 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007983 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7984 pnew->length = n;
7985 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007986 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007987 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007988}
7989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007990PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007991"unicode(string [, encoding[, errors]]) -> object\n\
7992\n\
7993Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007994encoding defaults to the current default string encoding.\n\
7995errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007996
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007997static PyObject *unicode_iter(PyObject *seq);
7998
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999PyTypeObject PyUnicode_Type = {
8000 PyObject_HEAD_INIT(&PyType_Type)
8001 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008002 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 sizeof(PyUnicodeObject), /* tp_size */
8004 0, /* tp_itemsize */
8005 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008006 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008008 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008010 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008011 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008012 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008014 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 (hashfunc) unicode_hash, /* tp_hash*/
8016 0, /* tp_call*/
8017 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008018 PyObject_GenericGetAttr, /* tp_getattro */
8019 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008021 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8022 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008023 unicode_doc, /* tp_doc */
8024 0, /* tp_traverse */
8025 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008026 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008027 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008028 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008029 0, /* tp_iternext */
8030 unicode_methods, /* tp_methods */
8031 0, /* tp_members */
8032 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008033 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008034 0, /* tp_dict */
8035 0, /* tp_descr_get */
8036 0, /* tp_descr_set */
8037 0, /* tp_dictoffset */
8038 0, /* tp_init */
8039 0, /* tp_alloc */
8040 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008041 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042};
8043
8044/* Initialize the Unicode implementation */
8045
Thomas Wouters78890102000-07-22 19:25:51 +00008046void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008048 int i;
8049
Thomas Wouters477c8d52006-05-27 19:21:47 +00008050 /* XXX - move this array to unicodectype.c ? */
8051 Py_UNICODE linebreak[] = {
8052 0x000A, /* LINE FEED */
8053 0x000D, /* CARRIAGE RETURN */
8054 0x001C, /* FILE SEPARATOR */
8055 0x001D, /* GROUP SEPARATOR */
8056 0x001E, /* RECORD SEPARATOR */
8057 0x0085, /* NEXT LINE */
8058 0x2028, /* LINE SEPARATOR */
8059 0x2029, /* PARAGRAPH SEPARATOR */
8060 };
8061
Fred Drakee4315f52000-05-09 19:53:39 +00008062 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008063 unicode_freelist = NULL;
8064 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008066 if (!unicode_empty)
8067 return;
8068
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008069 for (i = 0; i < 256; i++)
8070 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008071 if (PyType_Ready(&PyUnicode_Type) < 0)
8072 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008073
8074 /* initialize the linebreak bloom filter */
8075 bloom_linebreak = make_bloom_mask(
8076 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8077 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008078
8079 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080}
8081
8082/* Finalize the Unicode implementation */
8083
8084void
Thomas Wouters78890102000-07-22 19:25:51 +00008085_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008087 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008088 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008090 Py_XDECREF(unicode_empty);
8091 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008092
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008093 for (i = 0; i < 256; i++) {
8094 if (unicode_latin1[i]) {
8095 Py_DECREF(unicode_latin1[i]);
8096 unicode_latin1[i] = NULL;
8097 }
8098 }
8099
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008100 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 PyUnicodeObject *v = u;
8102 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008103 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008104 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008105 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008106 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008108 unicode_freelist = NULL;
8109 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008111
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008112
8113
8114/********************* Unicode Iterator **************************/
8115
8116typedef struct {
8117 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008118 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008119 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8120} unicodeiterobject;
8121
8122static void
8123unicodeiter_dealloc(unicodeiterobject *it)
8124{
8125 _PyObject_GC_UNTRACK(it);
8126 Py_XDECREF(it->it_seq);
8127 PyObject_GC_Del(it);
8128}
8129
8130static int
8131unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8132{
8133 Py_VISIT(it->it_seq);
8134 return 0;
8135}
8136
8137static PyObject *
8138unicodeiter_next(unicodeiterobject *it)
8139{
8140 PyUnicodeObject *seq;
8141 PyObject *item;
8142
8143 assert(it != NULL);
8144 seq = it->it_seq;
8145 if (seq == NULL)
8146 return NULL;
8147 assert(PyUnicode_Check(seq));
8148
8149 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008150 item = PyUnicode_FromUnicode(
8151 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008152 if (item != NULL)
8153 ++it->it_index;
8154 return item;
8155 }
8156
8157 Py_DECREF(seq);
8158 it->it_seq = NULL;
8159 return NULL;
8160}
8161
8162static PyObject *
8163unicodeiter_len(unicodeiterobject *it)
8164{
8165 Py_ssize_t len = 0;
8166 if (it->it_seq)
8167 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8168 return PyInt_FromSsize_t(len);
8169}
8170
8171PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8172
8173static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008174 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8175 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008176 {NULL, NULL} /* sentinel */
8177};
8178
8179PyTypeObject PyUnicodeIter_Type = {
8180 PyObject_HEAD_INIT(&PyType_Type)
8181 0, /* ob_size */
8182 "unicodeiterator", /* tp_name */
8183 sizeof(unicodeiterobject), /* tp_basicsize */
8184 0, /* tp_itemsize */
8185 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008186 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008187 0, /* tp_print */
8188 0, /* tp_getattr */
8189 0, /* tp_setattr */
8190 0, /* tp_compare */
8191 0, /* tp_repr */
8192 0, /* tp_as_number */
8193 0, /* tp_as_sequence */
8194 0, /* tp_as_mapping */
8195 0, /* tp_hash */
8196 0, /* tp_call */
8197 0, /* tp_str */
8198 PyObject_GenericGetAttr, /* tp_getattro */
8199 0, /* tp_setattro */
8200 0, /* tp_as_buffer */
8201 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8202 0, /* tp_doc */
8203 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8204 0, /* tp_clear */
8205 0, /* tp_richcompare */
8206 0, /* tp_weaklistoffset */
8207 PyObject_SelfIter, /* tp_iter */
8208 (iternextfunc)unicodeiter_next, /* tp_iternext */
8209 unicodeiter_methods, /* tp_methods */
8210 0,
8211};
8212
8213static PyObject *
8214unicode_iter(PyObject *seq)
8215{
8216 unicodeiterobject *it;
8217
8218 if (!PyUnicode_Check(seq)) {
8219 PyErr_BadInternalCall();
8220 return NULL;
8221 }
8222 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8223 if (it == NULL)
8224 return NULL;
8225 it->it_index = 0;
8226 Py_INCREF(seq);
8227 it->it_seq = (PyUnicodeObject *)seq;
8228 _PyObject_GC_TRACK(it);
8229 return (PyObject *)it;
8230}
8231
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008232#ifdef __cplusplus
8233}
8234#endif
8235
8236
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008237/*
8238Local variables:
8239c-basic-offset: 4
8240indent-tabs-mode: nil
8241End:
8242*/