blob: 9dc96da73c7f97c1d01a55bbaf878f866edd760f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the 5 least significant bits
   of each Unicode character selecting the bit index (0..31). */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.
   The shifted constant is 1UL, not 1: shifting a signed int by 31 is
   undefined behaviour and, where it yields INT_MIN, sign-extends into
   the upper bits of a wider unsigned long mask.  Both arguments are
   parenthesised so that expressions may be passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter with the bloom mask, then the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test: cheap bloom-mask pre-filter first, exact linear
   search second.  The whole expansion is parenthesised so the macro
   behaves as a single operand (e.g. under '!' or next to '||'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a Unicode object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
Walter Dörwalda14c4bb2007-05-06 10:00:02 +0000399 size_t size = strlen(u);
400 if (size > PY_SSIZE_T_MAX) {
401 PyErr_SetString(PyExc_OverflowError, "input too long");
402 return NULL;
403 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000404
405 /* If the Unicode data is known at construction time, we can apply
406 some optimizations which share commonly used objects. */
407 if (u != NULL) {
408
409 /* Optimization for empty strings */
410 if (size == 0 && unicode_empty != NULL) {
411 Py_INCREF(unicode_empty);
412 return (PyObject *)unicode_empty;
413 }
414
Walter Dörwald071b9da2007-05-05 14:21:20 +0000415 /* Single characters are shared when using this constructor */
416 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000417 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000418 if (!unicode) {
419 unicode = _PyUnicode_New(1);
420 if (!unicode)
421 return NULL;
422 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000423 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000424 }
425 Py_INCREF(unicode);
426 return (PyObject *)unicode;
427 }
428 }
429
430 unicode = _PyUnicode_New(size);
431 if (!unicode)
432 return NULL;
433
434 /* Copy the Unicode data into the new object */
435 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000436 Py_UNICODE *p = unicode->str;
437 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000438 ;
439 }
440
441 return (PyObject *)unicode;
442}
443
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444#ifdef HAVE_WCHAR_H
445
446PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000447 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448{
449 PyUnicodeObject *unicode;
450
451 if (w == NULL) {
452 PyErr_BadInternalCall();
453 return NULL;
454 }
455
456 unicode = _PyUnicode_New(size);
457 if (!unicode)
458 return NULL;
459
460 /* Copy the wchar_t data into the new object */
461#ifdef HAVE_USABLE_WCHAR_T
462 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000463#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 {
465 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000466 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000468 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 *u++ = *w++;
470 }
471#endif
472
473 return (PyObject *)unicode;
474}
475
Martin v. Löwis18e16552006-02-15 17:27:45 +0000476Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
477 wchar_t *w,
478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 if (unicode == NULL) {
481 PyErr_BadInternalCall();
482 return -1;
483 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484
485 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000487 size = PyUnicode_GET_SIZE(unicode) + 1;
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489#ifdef HAVE_USABLE_WCHAR_T
490 memcpy(w, unicode->str, size * sizeof(wchar_t));
491#else
492 {
493 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000494 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000496 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 *w++ = *u++;
498 }
499#endif
500
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000501 if (size > PyUnicode_GET_SIZE(unicode))
502 return PyUnicode_GET_SIZE(unicode);
503 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return size;
505}
506
507#endif
508
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509PyObject *PyUnicode_FromOrdinal(int ordinal)
510{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000511 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000512
513#ifdef Py_UNICODE_WIDE
514 if (ordinal < 0 || ordinal > 0x10ffff) {
515 PyErr_SetString(PyExc_ValueError,
516 "unichr() arg not in range(0x110000) "
517 "(wide Python build)");
518 return NULL;
519 }
520#else
521 if (ordinal < 0 || ordinal > 0xffff) {
522 PyErr_SetString(PyExc_ValueError,
523 "unichr() arg not in range(0x10000) "
524 "(narrow Python build)");
525 return NULL;
526 }
527#endif
528
Hye-Shik Chang40574832004-04-06 07:24:51 +0000529 s[0] = (Py_UNICODE)ordinal;
530 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533PyObject *PyUnicode_FromObject(register PyObject *obj)
534{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 /* XXX Perhaps we should make this API an alias of
536 PyObject_Unicode() instead ?! */
537 if (PyUnicode_CheckExact(obj)) {
538 Py_INCREF(obj);
539 return obj;
540 }
541 if (PyUnicode_Check(obj)) {
542 /* For a Unicode subtype that's not a Unicode object,
543 return a true Unicode object with the same data. */
544 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
545 PyUnicode_GET_SIZE(obj));
546 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
548}
549
550PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
551 const char *encoding,
552 const char *errors)
553{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000556 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000557
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (obj == NULL) {
559 PyErr_BadInternalCall();
560 return NULL;
561 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000563#if 0
564 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000565 that no encodings is given and then redirect to
566 PyObject_Unicode() which then applies the additional logic for
567 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000568
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000569 NOTE: This API should really only be used for object which
570 represent *encoded* Unicode !
571
572 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000573 if (PyUnicode_Check(obj)) {
574 if (encoding) {
575 PyErr_SetString(PyExc_TypeError,
576 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000578 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000579 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000580 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000581#else
582 if (PyUnicode_Check(obj)) {
583 PyErr_SetString(PyExc_TypeError,
584 "decoding Unicode is not supported");
585 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000586 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000587#endif
588
589 /* Coerce object */
590 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000591 s = PyString_AS_STRING(obj);
592 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000593 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000594 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
595 /* Overwrite the error message with something more useful in
596 case of a TypeError. */
597 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000599 "coercing to Unicode: need string or buffer, "
600 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000601 obj->ob_type->tp_name);
602 goto onError;
603 }
Tim Petersced69f82003-09-16 20:30:58 +0000604
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 if (len == 0) {
607 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000609 }
Tim Petersced69f82003-09-16 20:30:58 +0000610 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000611 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000612
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return v;
614
615 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617}
618
619PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000620 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 const char *encoding,
622 const char *errors)
623{
624 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625
626 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Shortcuts for common default encodings */
630 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000632 else if (strcmp(encoding, "latin-1") == 0)
633 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
635 else if (strcmp(encoding, "mbcs") == 0)
636 return PyUnicode_DecodeMBCS(s, size, errors);
637#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000638 else if (strcmp(encoding, "ascii") == 0)
639 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Decode via the codec registry */
642 buffer = PyBuffer_FromMemory((void *)s, size);
643 if (buffer == NULL)
644 goto onError;
645 unicode = PyCodec_Decode(buffer, encoding, errors);
646 if (unicode == NULL)
647 goto onError;
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode->ob_type->tp_name);
652 Py_DECREF(unicode);
653 goto onError;
654 }
655 Py_DECREF(buffer);
656 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 onError:
659 Py_XDECREF(buffer);
660 return NULL;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Decode via the codec registry */
678 v = PyCodec_Decode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 const char *encoding,
690 const char *errors)
691{
692 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 unicode = PyUnicode_FromUnicode(s, size);
695 if (unicode == NULL)
696 return NULL;
697 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
698 Py_DECREF(unicode);
699 return v;
700}
701
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000702PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
703 const char *encoding,
704 const char *errors)
705{
706 PyObject *v;
707
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712
713 if (encoding == NULL)
714 encoding = PyUnicode_GetDefaultEncoding();
715
716 /* Encode via the codec registry */
717 v = PyCodec_Encode(unicode, encoding, errors);
718 if (v == NULL)
719 goto onError;
720 return v;
721
722 onError:
723 return NULL;
724}
725
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
727 const char *encoding,
728 const char *errors)
729{
730 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 if (!PyUnicode_Check(unicode)) {
733 PyErr_BadArgument();
734 goto onError;
735 }
Fred Drakee4315f52000-05-09 19:53:39 +0000736
Tim Petersced69f82003-09-16 20:30:58 +0000737 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000738 encoding = PyUnicode_GetDefaultEncoding();
739
740 /* Shortcuts for common default encodings */
741 if (errors == NULL) {
742 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000743 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000744 else if (strcmp(encoding, "latin-1") == 0)
745 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000746#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
747 else if (strcmp(encoding, "mbcs") == 0)
748 return PyUnicode_AsMBCSString(unicode);
749#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000750 else if (strcmp(encoding, "ascii") == 0)
751 return PyUnicode_AsASCIIString(unicode);
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753
754 /* Encode via the codec registry */
755 v = PyCodec_Encode(unicode, encoding, errors);
756 if (v == NULL)
757 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000758 if (!PyBytes_Check(v)) {
759 if (PyString_Check(v)) {
760 /* Old codec, turn it into bytes */
761 PyObject *b = PyBytes_FromObject(v);
762 Py_DECREF(v);
763 return b;
764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000766 "encoder did not return a bytes object "
767 "(type=%.400s, encoding=%.20s, errors=%.20s)",
768 v->ob_type->tp_name,
769 encoding ? encoding : "NULL",
770 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 Py_DECREF(v);
772 goto onError;
773 }
774 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000775
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 onError:
777 return NULL;
778}
779
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000780PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
781 const char *errors)
782{
783 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000785 if (v)
786 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000787 if (errors != NULL)
788 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
789 if (errors == NULL) {
790 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
791 PyUnicode_GET_SIZE(unicode),
792 NULL);
793 }
794 else {
795 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
796 }
797 if (!b)
798 return NULL;
799 v = PyString_FromStringAndSize(PyBytes_AsString(b),
800 PyBytes_Size(b));
801 Py_DECREF(b);
802 if (!errors) {
803 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000804 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000805 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000806 return v;
807}
808
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
810{
811 if (!PyUnicode_Check(unicode)) {
812 PyErr_BadArgument();
813 goto onError;
814 }
815 return PyUnicode_AS_UNICODE(unicode);
816
817 onError:
818 return NULL;
819}
820
Martin v. Löwis18e16552006-02-15 17:27:45 +0000821Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822{
823 if (!PyUnicode_Check(unicode)) {
824 PyErr_BadArgument();
825 goto onError;
826 }
827 return PyUnicode_GET_SIZE(unicode);
828
829 onError:
830 return -1;
831}
832
Thomas Wouters78890102000-07-22 19:25:51 +0000833const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000834{
835 return unicode_default_encoding;
836}
837
838int PyUnicode_SetDefaultEncoding(const char *encoding)
839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000840 if (strcmp(encoding, unicode_default_encoding) != 0) {
841 PyErr_Format(PyExc_ValueError,
842 "Can only set default encoding to %s",
843 unicode_default_encoding);
844 return -1;
845 }
Fred Drakee4315f52000-05-09 19:53:39 +0000846 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849/* error handling callback helper:
850 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000851 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 and adjust various state variables.
853 return 0 on success, -1 on error
854*/
855
856static
857int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
858 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
860 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000863
864 PyObject *restuple = NULL;
865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
867 Py_ssize_t requiredsize;
868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000870 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000871 int res = -1;
872
873 if (*errorHandler == NULL) {
874 *errorHandler = PyCodec_LookupError(errors);
875 if (*errorHandler == NULL)
876 goto onError;
877 }
878
879 if (*exceptionObject == NULL) {
880 *exceptionObject = PyUnicodeDecodeError_Create(
881 encoding, input, insize, *startinpos, *endinpos, reason);
882 if (*exceptionObject == NULL)
883 goto onError;
884 }
885 else {
886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
887 goto onError;
888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
889 goto onError;
890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
891 goto onError;
892 }
893
894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
895 if (restuple == NULL)
896 goto onError;
897 if (!PyTuple_Check(restuple)) {
898 PyErr_Format(PyExc_TypeError, &argparse[4]);
899 goto onError;
900 }
901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
902 goto onError;
903 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 newpos = insize+newpos;
905 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000907 goto onError;
908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000909
910 /* need more space? (at least enough for what we
911 have+the replacement+the rest of the string (starting
912 at the new input position), so we won't have to check space
913 when there are no errors in the rest of the string) */
914 repptr = PyUnicode_AS_UNICODE(repunicode);
915 repsize = PyUnicode_GET_SIZE(repunicode);
916 requiredsize = *outpos + repsize + insize-newpos;
917 if (requiredsize > outsize) {
918 if (requiredsize<2*outsize)
919 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000920 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 goto onError;
922 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
923 }
924 *endinpos = newpos;
925 *inptr = input + newpos;
926 Py_UNICODE_COPY(*outptr, repptr, repsize);
927 *outptr += repsize;
928 *outpos += repsize;
929 /* we made it! */
930 res = 0;
931
932 onError:
933 Py_XDECREF(restuple);
934 return res;
935}
936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937/* --- UTF-7 Codec -------------------------------------------------------- */
938
939/* see RFC2152 for details */
940
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
959
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL: true when c cannot be written directly: it is outside
   1..127, is classified 1 in utf7_special, or is classified 2 / 3 while
   encodeWS / encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64: base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR: true when c belongs to the base64 alphabet.  The range test
   keeps values that isalnum() is not defined for (anything that is not an
   unsigned char value) away from it; the cast through unsigned long also
   turns negative values into "out of range". */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64: 6-bit value of base64 character c (only meaningful when
   B64CHAR(c) is true). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: while at least 6 bits are pending in ch, write the top 6 as one
   base64 character to *out and decrease bits accordingly.  Expands to a
   bare while statement and modifies out and bits. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* DECODE: while at least 16 bits are pending in ch, take the top 16 as one
   Py_UNICODE and append it to *out.  Relies on the expanding function
   providing an `errmsg` variable and a `utf7Error` label.

   A unit in 0xDC00..0xDFFF is not stored: `surrogate` is set and control
   jumps to utf7Error ("code pairs are not supported", since a pair cannot
   be represented in a 16-bit character).  While `surrogate` is set the
   next unit is dropped without further checks, because an error has
   already been generated for the pair.
   NOTE(review): the range tested is the low-surrogate range although the
   original comments speak of the high surrogate -- confirm intent. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 const char *errors)
1006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001008 Py_ssize_t startinpos;
1009 Py_ssize_t endinpos;
1010 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1015 int inShift = 0;
1016 unsigned int bitsleft = 0;
1017 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 int surrogate = 0;
1019 PyObject *errorHandler = NULL;
1020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021
1022 unicode = _PyUnicode_New(size);
1023 if (!unicode)
1024 return NULL;
1025 if (size == 0)
1026 return (PyObject *)unicode;
1027
1028 p = unicode->str;
1029 e = s + size;
1030
1031 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032 Py_UNICODE ch;
1033 restart:
1034 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035
1036 if (inShift) {
1037 if ((ch == '-') || !B64CHAR(ch)) {
1038 inShift = 0;
1039 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001040
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1042 if (bitsleft >= 6) {
1043 /* The shift sequence has a partial character in it. If
1044 bitsleft < 6 then we could just classify it as padding
1045 but that is not the case here */
1046
1047 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001048 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001051 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 here so indicate the potential of a misencoded character. */
1053
1054 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1055 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1056 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001057 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (ch == '-') {
1061 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001062 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = 1;
1064 }
1065 } else if (SPECIAL(ch,0,0)) {
1066 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001067 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 } else {
1069 *p++ = ch;
1070 }
1071 } else {
1072 charsleft = (charsleft << 6) | UB64(ch);
1073 bitsleft += 6;
1074 s++;
1075 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1076 }
1077 }
1078 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 s++;
1081 if (s < e && *s == '-') {
1082 s++;
1083 *p++ = '+';
1084 } else
1085 {
1086 inShift = 1;
1087 bitsleft = 0;
1088 }
1089 }
1090 else if (SPECIAL(ch,0,0)) {
1091 errmsg = "unexpected special character";
1092 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 }
1095 else {
1096 *p++ = ch;
1097 s++;
1098 }
1099 continue;
1100 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 outpos = p-PyUnicode_AS_UNICODE(unicode);
1102 endinpos = s-starts;
1103 if (unicode_decode_call_errorhandler(
1104 errors, &errorHandler,
1105 "utf7", errmsg,
1106 starts, size, &startinpos, &endinpos, &exc, &s,
1107 (PyObject **)&unicode, &outpos, &p))
1108 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
1110
1111 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001112 outpos = p-PyUnicode_AS_UNICODE(unicode);
1113 endinpos = size;
1114 if (unicode_decode_call_errorhandler(
1115 errors, &errorHandler,
1116 "utf7", "unterminated shift sequence",
1117 starts, size, &startinpos, &endinpos, &exc, &s,
1118 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 if (s < e)
1121 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 }
1123
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001124 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 goto onError;
1126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127 Py_XDECREF(errorHandler);
1128 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001129 return (PyObject *)unicode;
1130
1131onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 Py_XDECREF(errorHandler);
1133 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
1138
1139PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001140 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 int encodeSetO,
1142 int encodeWhiteSpace,
1143 const char *errors)
1144{
1145 PyObject *v;
1146 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 unsigned int bitsleft = 0;
1151 unsigned long charsleft = 0;
1152 char * out;
1153 char * start;
1154
1155 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001156 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 if (v == NULL)
1160 return NULL;
1161
Walter Dörwald51ab4142007-05-05 14:43:36 +00001162 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 for (;i < size; ++i) {
1164 Py_UNICODE ch = s[i];
1165
1166 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001167 if (ch == '+') {
1168 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 *out++ = '-';
1170 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1171 charsleft = ch;
1172 bitsleft = 16;
1173 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001176 } else {
1177 *out++ = (char) ch;
1178 }
1179 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001180 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1181 *out++ = B64(charsleft << (6-bitsleft));
1182 charsleft = 0;
1183 bitsleft = 0;
1184 /* Characters not in the BASE64 set implicitly unshift the sequence
1185 so no '-' is required, except if the character is itself a '-' */
1186 if (B64CHAR(ch) || ch == '-') {
1187 *out++ = '-';
1188 }
1189 inShift = 0;
1190 *out++ = (char) ch;
1191 } else {
1192 bitsleft += 16;
1193 charsleft = (charsleft << 16) | ch;
1194 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1195
1196 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001197 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001198 or '-' then the shift sequence will be terminated implicitly and we
1199 don't have to insert a '-'. */
1200
1201 if (bitsleft == 0) {
1202 if (i + 1 < size) {
1203 Py_UNICODE ch2 = s[i+1];
1204
1205 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001207 } else if (B64CHAR(ch2) || ch2 == '-') {
1208 *out++ = '-';
1209 inShift = 0;
1210 } else {
1211 inShift = 0;
1212 }
1213
1214 }
1215 else {
1216 *out++ = '-';
1217 inShift = 0;
1218 }
1219 }
Tim Petersced69f82003-09-16 20:30:58 +00001220 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001221 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001223 if (bitsleft) {
1224 *out++= B64(charsleft << (6-bitsleft) );
1225 *out++ = '-';
1226 }
1227
Walter Dörwald51ab4142007-05-05 14:43:36 +00001228 if (PyBytes_Resize(v, out - start)) {
1229 Py_DECREF(v);
1230 return NULL;
1231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001232 return v;
1233}
1234
/* The UTF-7 helper macros are private to the codec above; drop them so
   they cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242/* --- UTF-8 Codec -------------------------------------------------------- */
1243
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five, six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 const char *errors)
1269{
Walter Dörwald69652032004-09-07 20:24:22 +00001270 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1271}
1272
1273PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001275 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t startinpos;
1281 Py_ssize_t endinpos;
1282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 const char *e;
1284 PyUnicodeObject *unicode;
1285 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 PyObject *errorHandler = NULL;
1288 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 /* Note: size will always be longer than the resulting Unicode
1291 character count */
1292 unicode = _PyUnicode_New(size);
1293 if (!unicode)
1294 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 if (size == 0) {
1296 if (consumed)
1297 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 /* Unpack UTF-8 encoded data */
1302 p = unicode->str;
1303 e = s + size;
1304
1305 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001306 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
1308 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 s++;
1311 continue;
1312 }
1313
1314 n = utf8_code_length[ch];
1315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 break;
1319 else {
1320 errmsg = "unexpected end of data";
1321 startinpos = s-starts;
1322 endinpos = size;
1323 goto utf8Error;
1324 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 switch (n) {
1328
1329 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if ((s[1] & 0xc0) != 0x80) {
1343 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 startinpos = s-starts;
1351 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 errmsg = "illegal encoding";
1353 goto utf8Error;
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001356 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001360 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 (s[2] & 0xc0) != 0x80) {
1362 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368 if (ch < 0x0800) {
1369 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001370 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001371
1372 XXX For wide builds (UCS-4) we should probably try
1373 to recombine the surrogates into a single code
1374 unit.
1375 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001376 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001377 startinpos = s-starts;
1378 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001379 goto utf8Error;
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001382 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001383 break;
1384
1385 case 4:
1386 if ((s[1] & 0xc0) != 0x80 ||
1387 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 (s[3] & 0xc0) != 0x80) {
1389 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 startinpos = s-starts;
1391 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 goto utf8Error;
1393 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001394 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1395 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1396 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001398 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001399 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001400 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 startinpos = s-starts;
1404 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001405 goto utf8Error;
1406 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001407#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001408 *p++ = (Py_UNICODE)ch;
1409#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* translate from 10000..10FFFF to 0..FFFF */
1413 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001414
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001415 /* high surrogate = top 10 bits added to D800 */
1416 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001417
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001419 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001420#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 break;
1422
1423 default:
1424 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 startinpos = s-starts;
1427 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001428 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 }
1430 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001431 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001432
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001433 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001434 outpos = p-PyUnicode_AS_UNICODE(unicode);
1435 if (unicode_decode_call_errorhandler(
1436 errors, &errorHandler,
1437 "utf8", errmsg,
1438 starts, size, &startinpos, &endinpos, &exc, &s,
1439 (PyObject **)&unicode, &outpos, &p))
1440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 }
Walter Dörwald69652032004-09-07 20:24:22 +00001442 if (consumed)
1443 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
1445 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_XDECREF(errorHandler);
1450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451 return (PyObject *)unicode;
1452
1453onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001454 Py_XDECREF(errorHandler);
1455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 Py_DECREF(unicode);
1457 return NULL;
1458}
1459
Tim Peters602f7402002-04-27 18:03:26 +00001460/* Allocation strategy: if the string is short, convert into a stack buffer
1461 and allocate exactly as much space needed at the end. Else allocate the
1462 maximum possible needed (4 result bytes per Unicode character), and return
1463 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001465PyObject *
1466PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Tim Peters602f7402002-04-27 18:03:26 +00001470#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001471
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001473 PyObject *v; /* result string object */
1474 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001476 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001477 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 assert(s != NULL);
1480 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
Tim Peters602f7402002-04-27 18:03:26 +00001482 if (size <= MAX_SHORT_UNICHARS) {
1483 /* Write into the stack buffer; nallocated can't overflow.
1484 * At the end, we'll allocate exactly as much heap space as it
1485 * turns out we need.
1486 */
1487 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1488 v = NULL; /* will allocate after we're done */
1489 p = stackbuf;
1490 }
1491 else {
1492 /* Overallocate on the heap, and give the excess back at the end. */
1493 nallocated = size * 4;
1494 if (nallocated / 4 != size) /* overflow! */
1495 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001497 if (v == NULL)
1498 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001499 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501
Tim Peters602f7402002-04-27 18:03:26 +00001502 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001503 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001510 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001511 *p++ = (char)(0xc0 | (ch >> 6));
1512 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001513 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001514 else {
Tim Peters602f7402002-04-27 18:03:26 +00001515 /* Encode UCS2 Unicode ordinals */
1516 if (ch < 0x10000) {
1517 /* Special case: check for high surrogate */
1518 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1519 Py_UCS4 ch2 = s[i];
1520 /* Check for low surrogate and combine the two to
1521 form a UCS4 value */
1522 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001523 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001524 i++;
1525 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001526 }
Tim Peters602f7402002-04-27 18:03:26 +00001527 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001528 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001529 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001530 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1531 *p++ = (char)(0x80 | (ch & 0x3f));
1532 continue;
1533 }
1534encodeUCS4:
1535 /* Encode UCS4 Unicode ordinals */
1536 *p++ = (char)(0xf0 | (ch >> 18));
1537 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1538 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1539 *p++ = (char)(0x80 | (ch & 0x3f));
1540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001542
Tim Peters602f7402002-04-27 18:03:26 +00001543 if (v == NULL) {
1544 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
1549 else {
1550 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001551 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001552 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001553 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001556
Tim Peters602f7402002-04-27 18:03:26 +00001557#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558}
1559
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1561{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (!PyUnicode_Check(unicode)) {
1563 PyErr_BadArgument();
1564 return NULL;
1565 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001566 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1567 PyUnicode_GET_SIZE(unicode),
1568 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
1571/* --- UTF-16 Codec ------------------------------------------------------- */
1572
Tim Peters772747b2001-08-09 22:21:55 +00001573PyObject *
1574PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001576 const char *errors,
1577 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald69652032004-09-07 20:24:22 +00001579 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1580}
1581
1582PyObject *
1583PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001585 const char *errors,
1586 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001587 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001590 Py_ssize_t startinpos;
1591 Py_ssize_t endinpos;
1592 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 PyUnicodeObject *unicode;
1594 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001595 const unsigned char *q, *e;
1596 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001598 /* Offsets from q for retrieving byte pairs in the right order. */
1599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1600 int ihi = 1, ilo = 0;
1601#else
1602 int ihi = 0, ilo = 1;
1603#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 PyObject *errorHandler = NULL;
1605 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 /* Note: size will always be longer than the resulting Unicode
1608 character count */
1609 unicode = _PyUnicode_New(size);
1610 if (!unicode)
1611 return NULL;
1612 if (size == 0)
1613 return (PyObject *)unicode;
1614
1615 /* Unpack UTF-16 encoded data */
1616 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001617 q = (unsigned char *)s;
1618 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
1620 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001621 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623 /* Check for BOM marks (U+FEFF) in the input and adjust current
1624 byte order setting accordingly. In native mode, the leading BOM
1625 mark is skipped, in all other modes, it is copied to the output
1626 stream as-is (giving a ZWNBSP character). */
1627 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001628 if (size >= 2) {
1629 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001631 if (bom == 0xFEFF) {
1632 q += 2;
1633 bo = -1;
1634 }
1635 else if (bom == 0xFFFE) {
1636 q += 2;
1637 bo = 1;
1638 }
Tim Petersced69f82003-09-16 20:30:58 +00001639#else
Walter Dörwald69652032004-09-07 20:24:22 +00001640 if (bom == 0xFEFF) {
1641 q += 2;
1642 bo = 1;
1643 }
1644 else if (bom == 0xFFFE) {
1645 q += 2;
1646 bo = -1;
1647 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001648#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001649 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Tim Peters772747b2001-08-09 22:21:55 +00001652 if (bo == -1) {
1653 /* force LE */
1654 ihi = 1;
1655 ilo = 0;
1656 }
1657 else if (bo == 1) {
1658 /* force BE */
1659 ihi = 0;
1660 ilo = 1;
1661 }
1662
1663 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001665 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001667 if (consumed)
1668 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 errmsg = "truncated data";
1670 startinpos = ((const char *)q)-starts;
1671 endinpos = ((const char *)e)-starts;
1672 goto utf16Error;
1673 /* The remaining input chars are ignored if the callback
1674 chooses to skip the input */
1675 }
1676 ch = (q[ihi] << 8) | q[ilo];
1677
Tim Peters772747b2001-08-09 22:21:55 +00001678 q += 2;
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 if (ch < 0xD800 || ch > 0xDFFF) {
1681 *p++ = ch;
1682 continue;
1683 }
1684
1685 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 if (q >= e) {
1687 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001688 startinpos = (((const char *)q)-2)-starts;
1689 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001690 goto utf16Error;
1691 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001692 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001693 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1694 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001695 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001696#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001697 *p++ = ch;
1698 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699#else
1700 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001703 }
1704 else {
1705 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-4)-starts;
1707 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001708 goto utf16Error;
1709 }
1710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001712 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001713 startinpos = (((const char *)q)-2)-starts;
1714 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 /* Fall through to report the error */
1716
1717 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 outpos = p-PyUnicode_AS_UNICODE(unicode);
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf16", errmsg,
1722 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1723 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 }
1726
1727 if (byteorder)
1728 *byteorder = bo;
1729
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 *consumed = (const char *)q-starts;
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001734 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 goto onError;
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return (PyObject *)unicode;
1740
1741onError:
1742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 Py_XDECREF(errorHandler);
1744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return NULL;
1746}
1747
Tim Peters772747b2001-08-09 22:21:55 +00001748PyObject *
1749PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001750 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001751 const char *errors,
1752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753{
1754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001755 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001756#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001757 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001758#else
1759 const int pairs = 0;
1760#endif
Tim Peters772747b2001-08-09 22:21:55 +00001761 /* Offsets from p for storing byte pairs in the right order. */
1762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1763 int ihi = 1, ilo = 0;
1764#else
1765 int ihi = 0, ilo = 1;
1766#endif
1767
1768#define STORECHAR(CH) \
1769 do { \
1770 p[ihi] = ((CH) >> 8) & 0xff; \
1771 p[ilo] = (CH) & 0xff; \
1772 p += 2; \
1773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001776 for (i = pairs = 0; i < size; i++)
1777 if (s[i] >= 0x10000)
1778 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001779#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001780 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001781 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (v == NULL)
1783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Walter Dörwald3cc34522007-05-04 10:48:27 +00001785 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001787 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001788 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001790
1791 if (byteorder == -1) {
1792 /* force LE */
1793 ihi = 1;
1794 ilo = 0;
1795 }
1796 else if (byteorder == 1) {
1797 /* force BE */
1798 ihi = 0;
1799 ilo = 1;
1800 }
1801
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *s++;
1804 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001807 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1808 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001810#endif
Tim Peters772747b2001-08-09 22:21:55 +00001811 STORECHAR(ch);
1812 if (ch2)
1813 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001816#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817}
1818
1819PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1820{
1821 if (!PyUnicode_Check(unicode)) {
1822 PyErr_BadArgument();
1823 return NULL;
1824 }
1825 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1826 PyUnicode_GET_SIZE(unicode),
1827 NULL,
1828 0);
1829}
1830
1831/* --- Unicode Escape Codec ----------------------------------------------- */
1832
Fredrik Lundh06d12682001-01-24 07:59:11 +00001833static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 const char *errors)
1838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001840 Py_ssize_t startinpos;
1841 Py_ssize_t endinpos;
1842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 char* message;
1848 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 PyObject *errorHandler = NULL;
1850 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 /* Escaped strings will always be longer than the resulting
1853 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 length after conversion to the true value.
1855 (but if the error callback returns a long replacement string
1856 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 v = _PyUnicode_New(size);
1858 if (v == NULL)
1859 goto onError;
1860 if (size == 0)
1861 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 while (s < end) {
1867 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001868 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 /* Non-escape characters are interpreted as Unicode ordinals */
1872 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 continue;
1875 }
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 /* \ - Escapes */
1879 s++;
1880 switch (*s++) {
1881
1882 /* \x escapes */
1883 case '\n': break;
1884 case '\\': *p++ = '\\'; break;
1885 case '\'': *p++ = '\''; break;
1886 case '\"': *p++ = '\"'; break;
1887 case 'b': *p++ = '\b'; break;
1888 case 'f': *p++ = '\014'; break; /* FF */
1889 case 't': *p++ = '\t'; break;
1890 case 'n': *p++ = '\n'; break;
1891 case 'r': *p++ = '\r'; break;
1892 case 'v': *p++ = '\013'; break; /* VT */
1893 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1894
1895 /* \OOO (octal) escapes */
1896 case '0': case '1': case '2': case '3':
1897 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001902 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001904 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 break;
1906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* hex escapes */
1908 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 2;
1911 message = "truncated \\xXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 4;
1917 message = "truncated \\uXXXX escape";
1918 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919
Fredrik Lundhccc74732001-02-18 22:13:49 +00001920 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 digits = 8;
1923 message = "truncated \\UXXXXXXXX escape";
1924 hexescape:
1925 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001926 outpos = p-PyUnicode_AS_UNICODE(v);
1927 if (s+digits>end) {
1928 endinpos = size;
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "end of string in escape sequence",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
1934 goto onError;
1935 goto nextByte;
1936 }
1937 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 endinpos = (s+i+1)-starts;
1941 if (unicode_decode_call_errorhandler(
1942 errors, &errorHandler,
1943 "unicodeescape", message,
1944 starts, size, &startinpos, &endinpos, &exc, &s,
1945 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001948 }
1949 chr = (chr<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 chr += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 chr += 10 + c - 'a';
1954 else
1955 chr += 10 + c - 'A';
1956 }
1957 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001959 /* _decoding_error will have already written into the
1960 target buffer. */
1961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001963 /* when we get here, chr is a 32-bit unicode character */
1964 if (chr <= 0xffff)
1965 /* UCS-2 character */
1966 *p++ = (Py_UNICODE) chr;
1967 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001969 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001970#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001971 *p++ = chr;
1972#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 chr -= 0x10000L;
1974 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001975 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001976#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001977 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978 endinpos = s-starts;
1979 outpos = p-PyUnicode_AS_UNICODE(v);
1980 if (unicode_decode_call_errorhandler(
1981 errors, &errorHandler,
1982 "unicodeescape", "illegal Unicode character",
1983 starts, size, &startinpos, &endinpos, &exc, &s,
1984 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001985 goto onError;
1986 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 /* \N{name} */
1990 case 'N':
1991 message = "malformed \\N character escape";
1992 if (ucnhash_CAPI == NULL) {
1993 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 m = PyImport_ImportModule("unicodedata");
1996 if (m == NULL)
1997 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001998 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002000 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002002 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002003 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004 if (ucnhash_CAPI == NULL)
2005 goto ucnhashError;
2006 }
2007 if (*s == '{') {
2008 const char *start = s+1;
2009 /* look for the closing brace */
2010 while (*s != '}' && s < end)
2011 s++;
2012 if (s > start && s < end && *s == '}') {
2013 /* found a name. look it up in the unicode database */
2014 message = "unknown Unicode character name";
2015 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002016 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017 goto store;
2018 }
2019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 endinpos = s-starts;
2021 outpos = p-PyUnicode_AS_UNICODE(v);
2022 if (unicode_decode_call_errorhandler(
2023 errors, &errorHandler,
2024 "unicodeescape", message,
2025 starts, size, &startinpos, &endinpos, &exc, &s,
2026 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028 break;
2029
2030 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002031 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 message = "\\ at end of string";
2033 s--;
2034 endinpos = s-starts;
2035 outpos = p-PyUnicode_AS_UNICODE(v);
2036 if (unicode_decode_call_errorhandler(
2037 errors, &errorHandler,
2038 "unicodeescape", message,
2039 starts, size, &startinpos, &endinpos, &exc, &s,
2040 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002041 goto onError;
2042 }
2043 else {
2044 *p++ = '\\';
2045 *p++ = (unsigned char)s[-1];
2046 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002047 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 nextByte:
2050 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002054 Py_XDECREF(errorHandler);
2055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002057
Fredrik Lundhccc74732001-02-18 22:13:49 +00002058ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002059 PyErr_SetString(
2060 PyExc_UnicodeError,
2061 "\\N escapes not supported (can't load unicodedata module)"
2062 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002066 return NULL;
2067
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 Py_XDECREF(errorHandler);
2071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 return NULL;
2073}
2074
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in "" or '' quotes as
   appropriate.

*/
2081
Thomas Wouters477c8d52006-05-27 19:21:47 +00002082Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2083 Py_ssize_t size,
2084 Py_UNICODE ch)
2085{
2086 /* like wcschr, but doesn't stop at NULL characters */
2087
2088 while (size-- > 0) {
2089 if (*s == ch)
2090 return s;
2091 s++;
2092 }
2093
2094 return NULL;
2095}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002096
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097static
2098PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002099 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 int quotes)
2101{
2102 PyObject *repr;
2103 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002105 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106
Thomas Wouters89f507f2006-12-13 04:49:30 +00002107 /* XXX(nnorwitz): rather than over-allocating, it would be
2108 better to choose a different scheme. Perhaps scan the
2109 first N-chars of the string and allocate based on that size.
2110 */
2111 /* Initial allocation is based on the longest-possible unichr
2112 escape.
2113
2114 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2115 unichr, so in this case it's the longest unichr escape. In
2116 narrow (UTF-16) builds this is five chars per source unichr
2117 since there are two unichrs in the surrogate pair, so in narrow
2118 (UTF-16) builds it's not the longest unichr escape.
2119
2120 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2121 so in the narrow (UTF-16) build case it's the longest unichr
2122 escape.
2123 */
2124
2125 repr = PyString_FromStringAndSize(NULL,
2126 2
2127#ifdef Py_UNICODE_WIDE
2128 + 10*size
2129#else
2130 + 6*size
2131#endif
2132 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 if (repr == NULL)
2134 return NULL;
2135
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002136 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137
2138 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002139 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 !findchar(s, size, '"')) ? '"' : '\'';
2141 }
2142 while (size-- > 0) {
2143 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002145 /* Escape quotes and backslashes */
2146 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002147 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 *p++ = '\\';
2149 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002150 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002153#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002154 /* Map 21-bit characters to '\U00xxxxxx' */
2155 else if (ch >= 0x10000) {
2156 *p++ = '\\';
2157 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002158 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2159 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2160 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2161 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2162 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2163 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2164 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002165 *p++ = hexdigit[ch & 0x0000000F];
2166 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002167 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002168#else
2169 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002170 else if (ch >= 0xD800 && ch < 0xDC00) {
2171 Py_UNICODE ch2;
2172 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002173
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002174 ch2 = *s++;
2175 size--;
2176 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2177 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2178 *p++ = '\\';
2179 *p++ = 'U';
2180 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2181 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2182 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2183 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2184 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2185 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2186 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2187 *p++ = hexdigit[ucs & 0x0000000F];
2188 continue;
2189 }
2190 /* Fall through: isolated surrogates are copied as-is */
2191 s--;
2192 size++;
2193 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002194#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002195
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002197 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 *p++ = '\\';
2199 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002200 *p++ = hexdigit[(ch >> 12) & 0x000F];
2201 *p++ = hexdigit[(ch >> 8) & 0x000F];
2202 *p++ = hexdigit[(ch >> 4) & 0x000F];
2203 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002205
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002206 /* Map special whitespace to '\t', \n', '\r' */
2207 else if (ch == '\t') {
2208 *p++ = '\\';
2209 *p++ = 't';
2210 }
2211 else if (ch == '\n') {
2212 *p++ = '\\';
2213 *p++ = 'n';
2214 }
2215 else if (ch == '\r') {
2216 *p++ = '\\';
2217 *p++ = 'r';
2218 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002219
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002220 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002221 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002223 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002224 *p++ = hexdigit[(ch >> 4) & 0x000F];
2225 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002226 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002227
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 /* Copy everything else as-is */
2229 else
2230 *p++ = (char) ch;
2231 }
2232 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002233 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234
2235 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002236 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 return repr;
2238}
2239
2240PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002241 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242{
2243 return unicodeescape_string(s, size, 0);
2244}
2245
2246PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2247{
2248 if (!PyUnicode_Check(unicode)) {
2249 PyErr_BadArgument();
2250 return NULL;
2251 }
2252 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2253 PyUnicode_GET_SIZE(unicode));
2254}
2255
2256/* --- Raw Unicode Escape Codec ------------------------------------------- */
2257
2258PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 const char *errors)
2261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002263 Py_ssize_t startinpos;
2264 Py_ssize_t endinpos;
2265 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 const char *end;
2269 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 PyObject *errorHandler = NULL;
2271 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002272
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 /* Escaped strings will always be longer than the resulting
2274 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 length after conversion to the true value. (But decoding error
2276 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 v = _PyUnicode_New(size);
2278 if (v == NULL)
2279 goto onError;
2280 if (size == 0)
2281 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 end = s + size;
2284 while (s < end) {
2285 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002286 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002288 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289
2290 /* Non-escape characters are interpreted as Unicode ordinals */
2291 if (*s != '\\') {
2292 *p++ = (unsigned char)*s++;
2293 continue;
2294 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296
2297 /* \u-escapes are only interpreted iff the number of leading
2298 backslashes if odd */
2299 bs = s;
2300 for (;s < end;) {
2301 if (*s != '\\')
2302 break;
2303 *p++ = (unsigned char)*s++;
2304 }
2305 if (((s - bs) & 1) == 0 ||
2306 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002307 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 continue;
2309 }
2310 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002311 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 s++;
2313
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002314 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002315 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002316 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002317 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 endinpos = s-starts;
2320 if (unicode_decode_call_errorhandler(
2321 errors, &errorHandler,
2322 "rawunicodeescape", "truncated \\uXXXX",
2323 starts, size, &startinpos, &endinpos, &exc, &s,
2324 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002326 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 }
2328 x = (x<<4) & ~0xF;
2329 if (c >= '0' && c <= '9')
2330 x += c - '0';
2331 else if (c >= 'a' && c <= 'f')
2332 x += 10 + c - 'a';
2333 else
2334 x += 10 + c - 'A';
2335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002336#ifndef Py_UNICODE_WIDE
2337 if (x > 0x10000) {
2338 if (unicode_decode_call_errorhandler(
2339 errors, &errorHandler,
2340 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2341 starts, size, &startinpos, &endinpos, &exc, &s,
2342 (PyObject **)&v, &outpos, &p))
2343 goto onError;
2344 }
2345#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 *p++ = x;
2347 nextByte:
2348 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002351 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002355
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 onError:
2357 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 return NULL;
2361}
2362
2363PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365{
2366 PyObject *repr;
2367 char *p;
2368 char *q;
2369
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002370 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002372#ifdef Py_UNICODE_WIDE
2373 repr = PyString_FromStringAndSize(NULL, 10 * size);
2374#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002376#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 if (repr == NULL)
2378 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002379 if (size == 0)
2380 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 p = q = PyString_AS_STRING(repr);
2383 while (size-- > 0) {
2384 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002385#ifdef Py_UNICODE_WIDE
2386 /* Map 32-bit characters to '\Uxxxxxxxx' */
2387 if (ch >= 0x10000) {
2388 *p++ = '\\';
2389 *p++ = 'U';
2390 *p++ = hexdigit[(ch >> 28) & 0xf];
2391 *p++ = hexdigit[(ch >> 24) & 0xf];
2392 *p++ = hexdigit[(ch >> 20) & 0xf];
2393 *p++ = hexdigit[(ch >> 16) & 0xf];
2394 *p++ = hexdigit[(ch >> 12) & 0xf];
2395 *p++ = hexdigit[(ch >> 8) & 0xf];
2396 *p++ = hexdigit[(ch >> 4) & 0xf];
2397 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002398 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002399 else
2400#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 /* Map 16-bit characters to '\uxxxx' */
2402 if (ch >= 256) {
2403 *p++ = '\\';
2404 *p++ = 'u';
2405 *p++ = hexdigit[(ch >> 12) & 0xf];
2406 *p++ = hexdigit[(ch >> 8) & 0xf];
2407 *p++ = hexdigit[(ch >> 4) & 0xf];
2408 *p++ = hexdigit[ch & 15];
2409 }
2410 /* Copy everything else as-is */
2411 else
2412 *p++ = (char) ch;
2413 }
2414 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002415 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416 return repr;
2417}
2418
2419PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2420{
2421 if (!PyUnicode_Check(unicode)) {
2422 PyErr_BadArgument();
2423 return NULL;
2424 }
2425 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2426 PyUnicode_GET_SIZE(unicode));
2427}
2428
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002429/* --- Unicode Internal Codec ------------------------------------------- */
2430
2431PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002432 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002433 const char *errors)
2434{
2435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t startinpos;
2437 Py_ssize_t endinpos;
2438 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002439 PyUnicodeObject *v;
2440 Py_UNICODE *p;
2441 const char *end;
2442 const char *reason;
2443 PyObject *errorHandler = NULL;
2444 PyObject *exc = NULL;
2445
Neal Norwitzd43069c2006-01-08 01:12:10 +00002446#ifdef Py_UNICODE_WIDE
2447 Py_UNICODE unimax = PyUnicode_GetMax();
2448#endif
2449
Thomas Wouters89f507f2006-12-13 04:49:30 +00002450 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002451 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2452 if (v == NULL)
2453 goto onError;
2454 if (PyUnicode_GetSize((PyObject *)v) == 0)
2455 return (PyObject *)v;
2456 p = PyUnicode_AS_UNICODE(v);
2457 end = s + size;
2458
2459 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002460 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002461 /* We have to sanity check the raw data, otherwise doom looms for
2462 some malformed UCS-4 data. */
2463 if (
2464 #ifdef Py_UNICODE_WIDE
2465 *p > unimax || *p < 0 ||
2466 #endif
2467 end-s < Py_UNICODE_SIZE
2468 )
2469 {
2470 startinpos = s - starts;
2471 if (end-s < Py_UNICODE_SIZE) {
2472 endinpos = end-starts;
2473 reason = "truncated input";
2474 }
2475 else {
2476 endinpos = s - starts + Py_UNICODE_SIZE;
2477 reason = "illegal code point (> 0x10FFFF)";
2478 }
2479 outpos = p - PyUnicode_AS_UNICODE(v);
2480 if (unicode_decode_call_errorhandler(
2481 errors, &errorHandler,
2482 "unicode_internal", reason,
2483 starts, size, &startinpos, &endinpos, &exc, &s,
2484 (PyObject **)&v, &outpos, &p)) {
2485 goto onError;
2486 }
2487 }
2488 else {
2489 p++;
2490 s += Py_UNICODE_SIZE;
2491 }
2492 }
2493
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002494 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002495 goto onError;
2496 Py_XDECREF(errorHandler);
2497 Py_XDECREF(exc);
2498 return (PyObject *)v;
2499
2500 onError:
2501 Py_XDECREF(v);
2502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
2504 return NULL;
2505}
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507/* --- Latin-1 Codec ------------------------------------------------------ */
2508
2509PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002510 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 const char *errors)
2512{
2513 PyUnicodeObject *v;
2514 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002515
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002517 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002518 Py_UNICODE r = *(unsigned char*)s;
2519 return PyUnicode_FromUnicode(&r, 1);
2520 }
2521
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 v = _PyUnicode_New(size);
2523 if (v == NULL)
2524 goto onError;
2525 if (size == 0)
2526 return (PyObject *)v;
2527 p = PyUnicode_AS_UNICODE(v);
2528 while (size-- > 0)
2529 *p++ = (unsigned char)*s++;
2530 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 onError:
2533 Py_XDECREF(v);
2534 return NULL;
2535}
2536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537/* create or adjust a UnicodeEncodeError */
2538static void make_encode_exception(PyObject **exceptionObject,
2539 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002540 const Py_UNICODE *unicode, Py_ssize_t size,
2541 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (*exceptionObject == NULL) {
2545 *exceptionObject = PyUnicodeEncodeError_Create(
2546 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 }
2548 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2550 goto onError;
2551 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2552 goto onError;
2553 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2554 goto onError;
2555 return;
2556 onError:
2557 Py_DECREF(*exceptionObject);
2558 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 }
2560}
2561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562/* raises a UnicodeEncodeError */
2563static void raise_encode_exception(PyObject **exceptionObject,
2564 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002565 const Py_UNICODE *unicode, Py_ssize_t size,
2566 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 const char *reason)
2568{
2569 make_encode_exception(exceptionObject,
2570 encoding, unicode, size, startpos, endpos, reason);
2571 if (*exceptionObject != NULL)
2572 PyCodec_StrictErrors(*exceptionObject);
2573}
2574
2575/* error handling callback helper:
2576 build arguments, call the callback and check the arguments,
2577 put the result into newpos and return the replacement string, which
2578 has to be freed by the caller */
2579static PyObject *unicode_encode_call_errorhandler(const char *errors,
2580 PyObject **errorHandler,
2581 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002582 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2583 Py_ssize_t startpos, Py_ssize_t endpos,
2584 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002585{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587
2588 PyObject *restuple;
2589 PyObject *resunicode;
2590
2591 if (*errorHandler == NULL) {
2592 *errorHandler = PyCodec_LookupError(errors);
2593 if (*errorHandler == NULL)
2594 return NULL;
2595 }
2596
2597 make_encode_exception(exceptionObject,
2598 encoding, unicode, size, startpos, endpos, reason);
2599 if (*exceptionObject == NULL)
2600 return NULL;
2601
2602 restuple = PyObject_CallFunctionObjArgs(
2603 *errorHandler, *exceptionObject, NULL);
2604 if (restuple == NULL)
2605 return NULL;
2606 if (!PyTuple_Check(restuple)) {
2607 PyErr_Format(PyExc_TypeError, &argparse[4]);
2608 Py_DECREF(restuple);
2609 return NULL;
2610 }
2611 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2612 &resunicode, newpos)) {
2613 Py_DECREF(restuple);
2614 return NULL;
2615 }
2616 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002617 *newpos = size+*newpos;
2618 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002619 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002620 Py_DECREF(restuple);
2621 return NULL;
2622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 Py_INCREF(resunicode);
2624 Py_DECREF(restuple);
2625 return resunicode;
2626}
2627
2628static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002629 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 const char *errors,
2631 int limit)
2632{
2633 /* output object */
2634 PyObject *res;
2635 /* pointers to the beginning and end+1 of input */
2636 const Py_UNICODE *startp = p;
2637 const Py_UNICODE *endp = p + size;
2638 /* pointer to the beginning of the unencodable characters */
2639 /* const Py_UNICODE *badp = NULL; */
2640 /* pointer into the output */
2641 char *str;
2642 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002643 Py_ssize_t respos = 0;
2644 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002645 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2646 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 PyObject *errorHandler = NULL;
2648 PyObject *exc = NULL;
2649 /* the following variable is used for caching string comparisons
2650 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2651 int known_errorHandler = -1;
2652
2653 /* allocate enough for a simple encoding without
2654 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002655 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 if (res == NULL)
2657 goto onError;
2658 if (size == 0)
2659 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002660 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 ressize = size;
2662
2663 while (p<endp) {
2664 Py_UNICODE c = *p;
2665
2666 /* can we encode this? */
2667 if (c<limit) {
2668 /* no overflow check, because we know that the space is enough */
2669 *str++ = (char)c;
2670 ++p;
2671 }
2672 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002673 Py_ssize_t unicodepos = p-startp;
2674 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002676 Py_ssize_t repsize;
2677 Py_ssize_t newpos;
2678 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 Py_UNICODE *uni2;
2680 /* startpos for collecting unencodable chars */
2681 const Py_UNICODE *collstart = p;
2682 const Py_UNICODE *collend = p;
2683 /* find all unecodable characters */
2684 while ((collend < endp) && ((*collend)>=limit))
2685 ++collend;
2686 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2687 if (known_errorHandler==-1) {
2688 if ((errors==NULL) || (!strcmp(errors, "strict")))
2689 known_errorHandler = 1;
2690 else if (!strcmp(errors, "replace"))
2691 known_errorHandler = 2;
2692 else if (!strcmp(errors, "ignore"))
2693 known_errorHandler = 3;
2694 else if (!strcmp(errors, "xmlcharrefreplace"))
2695 known_errorHandler = 4;
2696 else
2697 known_errorHandler = 0;
2698 }
2699 switch (known_errorHandler) {
2700 case 1: /* strict */
2701 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2702 goto onError;
2703 case 2: /* replace */
2704 while (collstart++<collend)
2705 *str++ = '?'; /* fall through */
2706 case 3: /* ignore */
2707 p = collend;
2708 break;
2709 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002710 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 /* determine replacement size (temporarily (mis)uses p) */
2712 for (p = collstart, repsize = 0; p < collend; ++p) {
2713 if (*p<10)
2714 repsize += 2+1+1;
2715 else if (*p<100)
2716 repsize += 2+2+1;
2717 else if (*p<1000)
2718 repsize += 2+3+1;
2719 else if (*p<10000)
2720 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002721#ifndef Py_UNICODE_WIDE
2722 else
2723 repsize += 2+5+1;
2724#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 else if (*p<100000)
2726 repsize += 2+5+1;
2727 else if (*p<1000000)
2728 repsize += 2+6+1;
2729 else
2730 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002731#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 }
2733 requiredsize = respos+repsize+(endp-collend);
2734 if (requiredsize > ressize) {
2735 if (requiredsize<2*ressize)
2736 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002737 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002739 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 ressize = requiredsize;
2741 }
2742 /* generate replacement (temporarily (mis)uses p) */
2743 for (p = collstart; p < collend; ++p) {
2744 str += sprintf(str, "&#%d;", (int)*p);
2745 }
2746 p = collend;
2747 break;
2748 default:
2749 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2750 encoding, reason, startp, size, &exc,
2751 collstart-startp, collend-startp, &newpos);
2752 if (repunicode == NULL)
2753 goto onError;
2754 /* need more space? (at least enough for what we
2755 have+the replacement+the rest of the string, so
2756 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002757 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 repsize = PyUnicode_GET_SIZE(repunicode);
2759 requiredsize = respos+repsize+(endp-collend);
2760 if (requiredsize > ressize) {
2761 if (requiredsize<2*ressize)
2762 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002763 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 Py_DECREF(repunicode);
2765 goto onError;
2766 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002767 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 ressize = requiredsize;
2769 }
2770 /* check if there is anything unencodable in the replacement
2771 and copy it to the output */
2772 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2773 c = *uni2;
2774 if (c >= limit) {
2775 raise_encode_exception(&exc, encoding, startp, size,
2776 unicodepos, unicodepos+1, reason);
2777 Py_DECREF(repunicode);
2778 goto onError;
2779 }
2780 *str = (char)c;
2781 }
2782 p = startp + newpos;
2783 Py_DECREF(repunicode);
2784 }
2785 }
2786 }
2787 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002788 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002789 if (respos<ressize)
2790 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002791 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 Py_XDECREF(errorHandler);
2793 Py_XDECREF(exc);
2794 return res;
2795
2796 onError:
2797 Py_XDECREF(res);
2798 Py_XDECREF(errorHandler);
2799 Py_XDECREF(exc);
2800 return NULL;
2801}
2802
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002804 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 const char *errors)
2806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808}
2809
2810PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2811{
2812 if (!PyUnicode_Check(unicode)) {
2813 PyErr_BadArgument();
2814 return NULL;
2815 }
2816 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2817 PyUnicode_GET_SIZE(unicode),
2818 NULL);
2819}
2820
2821/* --- 7-bit ASCII Codec -------------------------------------------------- */
2822
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 const char *errors)
2826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 PyUnicodeObject *v;
2829 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002830 Py_ssize_t startinpos;
2831 Py_ssize_t endinpos;
2832 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 const char *e;
2834 PyObject *errorHandler = NULL;
2835 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002838 if (size == 1 && *(unsigned char*)s < 128) {
2839 Py_UNICODE r = *(unsigned char*)s;
2840 return PyUnicode_FromUnicode(&r, 1);
2841 }
Tim Petersced69f82003-09-16 20:30:58 +00002842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 v = _PyUnicode_New(size);
2844 if (v == NULL)
2845 goto onError;
2846 if (size == 0)
2847 return (PyObject *)v;
2848 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 e = s + size;
2850 while (s < e) {
2851 register unsigned char c = (unsigned char)*s;
2852 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 ++s;
2855 }
2856 else {
2857 startinpos = s-starts;
2858 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 if (unicode_decode_call_errorhandler(
2861 errors, &errorHandler,
2862 "ascii", "ordinal not in range(128)",
2863 starts, size, &startinpos, &endinpos, &exc, &s,
2864 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002870 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 Py_XDECREF(errorHandler);
2872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002874
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 onError:
2876 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002877 Py_XDECREF(errorHandler);
2878 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 return NULL;
2880}
2881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002883 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 const char *errors)
2885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002886 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887}
2888
2889PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2890{
2891 if (!PyUnicode_Check(unicode)) {
2892 PyErr_BadArgument();
2893 return NULL;
2894 }
2895 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2896 PyUnicode_GET_SIZE(unicode),
2897 NULL);
2898}
2899
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002900#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002902/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002903
/* The MBCS helpers take int sizes; when Py_ssize_t is wider than int the
   public entry points have to process the input in INT_MAX-sized pieces. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
2907
2908/* XXX This code is limited to "true" double-byte encodings, as
2909 a) it assumes an incomplete character consists of a single byte, and
2910 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2911 encodings, see IsDBCSLeadByteEx documentation. */
2912
/* Return non-zero if the byte at s[offset] is a DBCS lead byte.  The byte
   must satisfy IsDBCSLeadByte(); it is then accepted if CharPrev() yields
   no earlier character, if the previous character does not start with a
   lead byte, or if the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    return (before == cur) || !IsDBCSLeadByte(*before) || (cur - before == 2);
}
2923
2924/*
2925 * Decode MBCS string into unicode object. If 'final' is set, converts
2926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2927 */
2928static int decode_mbcs(PyUnicodeObject **v,
2929 const char *s, /* MBCS string */
2930 int size, /* sizeof MBCS string */
2931 int final)
2932{
2933 Py_UNICODE *p;
2934 Py_ssize_t n = 0;
2935 int usize = 0;
2936
2937 assert(size >= 0);
2938
2939 /* Skip trailing lead-byte unless 'final' is set */
2940 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2941 --size;
2942
2943 /* First get the size of the result */
2944 if (size > 0) {
2945 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2946 if (usize == 0) {
2947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2948 return -1;
2949 }
2950 }
2951
2952 if (*v == NULL) {
2953 /* Create unicode object */
2954 *v = _PyUnicode_New(usize);
2955 if (*v == NULL)
2956 return -1;
2957 }
2958 else {
2959 /* Extend unicode object */
2960 n = PyUnicode_GET_SIZE(*v);
2961 if (_PyUnicode_Resize(v, n + usize) < 0)
2962 return -1;
2963 }
2964
2965 /* Do the conversion */
2966 if (size > 0) {
2967 p = PyUnicode_AS_UNICODE(*v) + n;
2968 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2969 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2970 return -1;
2971 }
2972 }
2973
2974 return size;
2975}
2976
2977PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2978 Py_ssize_t size,
2979 const char *errors,
2980 Py_ssize_t *consumed)
2981{
2982 PyUnicodeObject *v = NULL;
2983 int done;
2984
2985 if (consumed)
2986 *consumed = 0;
2987
2988#ifdef NEED_RETRY
2989 retry:
2990 if (size > INT_MAX)
2991 done = decode_mbcs(&v, s, INT_MAX, 0);
2992 else
2993#endif
2994 done = decode_mbcs(&v, s, (int)size, !consumed);
2995
2996 if (done < 0) {
2997 Py_XDECREF(v);
2998 return NULL;
2999 }
3000
3001 if (consumed)
3002 *consumed += done;
3003
3004#ifdef NEED_RETRY
3005 if (size > INT_MAX) {
3006 s += done;
3007 size -= done;
3008 goto retry;
3009 }
3010#endif
3011
3012 return (PyObject *)v;
3013}
3014
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003015PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003017 const char *errors)
3018{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003019 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3020}
3021
3022/*
3023 * Convert unicode into string object (MBCS).
3024 * Returns 0 if succeed, -1 otherwise.
3025 */
3026static int encode_mbcs(PyObject **repr,
3027 const Py_UNICODE *p, /* unicode */
3028 int size) /* size of unicode */
3029{
3030 int mbcssize = 0;
3031 Py_ssize_t n = 0;
3032
3033 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003034
3035 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003036 if (size > 0) {
3037 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3038 if (mbcssize == 0) {
3039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3040 return -1;
3041 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003042 }
3043
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003044 if (*repr == NULL) {
3045 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003046 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003047 if (*repr == NULL)
3048 return -1;
3049 }
3050 else {
3051 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003052 n = PyBytes_Size(*repr);
3053 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003054 return -1;
3055 }
3056
3057 /* Do the conversion */
3058 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003059 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003060 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3061 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3062 return -1;
3063 }
3064 }
3065
3066 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003067}
3068
3069PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003070 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003071 const char *errors)
3072{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003073 PyObject *repr = NULL;
3074 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003075
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003076#ifdef NEED_RETRY
3077 retry:
3078 if (size > INT_MAX)
3079 ret = encode_mbcs(&repr, p, INT_MAX);
3080 else
3081#endif
3082 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003083
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003084 if (ret < 0) {
3085 Py_XDECREF(repr);
3086 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003087 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003088
3089#ifdef NEED_RETRY
3090 if (size > INT_MAX) {
3091 p += INT_MAX;
3092 size -= INT_MAX;
3093 goto retry;
3094 }
3095#endif
3096
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003097 return repr;
3098}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003099
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003100PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3101{
3102 if (!PyUnicode_Check(unicode)) {
3103 PyErr_BadArgument();
3104 return NULL;
3105 }
3106 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3107 PyUnicode_GET_SIZE(unicode),
3108 NULL);
3109}
3110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003111#undef NEED_RETRY
3112
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003113#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003114
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115/* --- Character Mapping Codec -------------------------------------------- */
3116
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 PyObject *mapping,
3120 const char *errors)
3121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t startinpos;
3124 Py_ssize_t endinpos;
3125 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 PyUnicodeObject *v;
3128 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003129 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 PyObject *errorHandler = NULL;
3131 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003132 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003133 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003134
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 /* Default to Latin-1 */
3136 if (mapping == NULL)
3137 return PyUnicode_DecodeLatin1(s, size, errors);
3138
3139 v = _PyUnicode_New(size);
3140 if (v == NULL)
3141 goto onError;
3142 if (size == 0)
3143 return (PyObject *)v;
3144 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003145 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003146 if (PyUnicode_CheckExact(mapping)) {
3147 mapstring = PyUnicode_AS_UNICODE(mapping);
3148 maplen = PyUnicode_GET_SIZE(mapping);
3149 while (s < e) {
3150 unsigned char ch = *s;
3151 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003153 if (ch < maplen)
3154 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156 if (x == 0xfffe) {
3157 /* undefined mapping */
3158 outpos = p-PyUnicode_AS_UNICODE(v);
3159 startinpos = s-starts;
3160 endinpos = startinpos+1;
3161 if (unicode_decode_call_errorhandler(
3162 errors, &errorHandler,
3163 "charmap", "character maps to <undefined>",
3164 starts, size, &startinpos, &endinpos, &exc, &s,
3165 (PyObject **)&v, &outpos, &p)) {
3166 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003167 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003168 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003169 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003170 *p++ = x;
3171 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173 }
3174 else {
3175 while (s < e) {
3176 unsigned char ch = *s;
3177 PyObject *w, *x;
3178
3179 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3180 w = PyInt_FromLong((long)ch);
3181 if (w == NULL)
3182 goto onError;
3183 x = PyObject_GetItem(mapping, w);
3184 Py_DECREF(w);
3185 if (x == NULL) {
3186 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3187 /* No mapping found means: mapping is undefined. */
3188 PyErr_Clear();
3189 x = Py_None;
3190 Py_INCREF(x);
3191 } else
3192 goto onError;
3193 }
3194
3195 /* Apply mapping */
3196 if (PyInt_Check(x)) {
3197 long value = PyInt_AS_LONG(x);
3198 if (value < 0 || value > 65535) {
3199 PyErr_SetString(PyExc_TypeError,
3200 "character mapping must be in range(65536)");
3201 Py_DECREF(x);
3202 goto onError;
3203 }
3204 *p++ = (Py_UNICODE)value;
3205 }
3206 else if (x == Py_None) {
3207 /* undefined mapping */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 startinpos = s-starts;
3210 endinpos = startinpos+1;
3211 if (unicode_decode_call_errorhandler(
3212 errors, &errorHandler,
3213 "charmap", "character maps to <undefined>",
3214 starts, size, &startinpos, &endinpos, &exc, &s,
3215 (PyObject **)&v, &outpos, &p)) {
3216 Py_DECREF(x);
3217 goto onError;
3218 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003219 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003220 continue;
3221 }
3222 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003223 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003224
3225 if (targetsize == 1)
3226 /* 1-1 mapping */
3227 *p++ = *PyUnicode_AS_UNICODE(x);
3228
3229 else if (targetsize > 1) {
3230 /* 1-n mapping */
3231 if (targetsize > extrachars) {
3232 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3234 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003235 (targetsize << 2);
3236 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003237 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003238 if (_PyUnicode_Resize(&v,
3239 PyUnicode_GET_SIZE(v) + needed) < 0) {
3240 Py_DECREF(x);
3241 goto onError;
3242 }
3243 p = PyUnicode_AS_UNICODE(v) + oldpos;
3244 }
3245 Py_UNICODE_COPY(p,
3246 PyUnicode_AS_UNICODE(x),
3247 targetsize);
3248 p += targetsize;
3249 extrachars -= targetsize;
3250 }
3251 /* 1-0 mapping: skip the character */
3252 }
3253 else {
3254 /* wrong return value */
3255 PyErr_SetString(PyExc_TypeError,
3256 "character mapping must return integer, None or unicode");
3257 Py_DECREF(x);
3258 goto onError;
3259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003261 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 }
3264 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 Py_XDECREF(v);
3275 return NULL;
3276}
3277
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003278/* Charmap encoding: the lookup table */
3279
/* Three-level lookup table used for charmap encoding.
   level1 is indexed by the top bits of a character (c >> 11).
   level23 is a variable-length trailing array: the object is allocated
   with extra room so that it holds the level-2 table (16*count2 bytes)
   immediately followed by the level-3 table (128*count3 bytes). */
struct encoding_map{
    PyObject_HEAD
    unsigned char level1[32];   /* 0xFF marks an unused entry */
    int count2, count3;         /* number of level-2 / level-3 sub-tables */
    unsigned char level23[1];   /* level-2 table, then level-3 table */
};
3286
3287static PyObject*
3288encoding_map_size(PyObject *obj, PyObject* args)
3289{
3290 struct encoding_map *map = (struct encoding_map*)obj;
3291 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3292 128*map->count3);
3293}
3294
/* Method table for EncodingMap objects; terminated by a zeroed entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
3300
3301static void
3302encoding_map_dealloc(PyObject* o)
3303{
3304 PyObject_FREE(o);
3305}
3306
/* Type object for the charmap encoding lookup table (struct encoding_map).
   Only a deallocator and a method table are provided; every other slot
   is left empty. */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
3351
3352PyObject*
3353PyUnicode_BuildEncodingMap(PyObject* string)
3354{
3355 Py_UNICODE *decode;
3356 PyObject *result;
3357 struct encoding_map *mresult;
3358 int i;
3359 int need_dict = 0;
3360 unsigned char level1[32];
3361 unsigned char level2[512];
3362 unsigned char *mlevel1, *mlevel2, *mlevel3;
3363 int count2 = 0, count3 = 0;
3364
3365 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3366 PyErr_BadArgument();
3367 return NULL;
3368 }
3369 decode = PyUnicode_AS_UNICODE(string);
3370 memset(level1, 0xFF, sizeof level1);
3371 memset(level2, 0xFF, sizeof level2);
3372
3373 /* If there isn't a one-to-one mapping of NULL to \0,
3374 or if there are non-BMP characters, we need to use
3375 a mapping dictionary. */
3376 if (decode[0] != 0)
3377 need_dict = 1;
3378 for (i = 1; i < 256; i++) {
3379 int l1, l2;
3380 if (decode[i] == 0
3381 #ifdef Py_UNICODE_WIDE
3382 || decode[i] > 0xFFFF
3383 #endif
3384 ) {
3385 need_dict = 1;
3386 break;
3387 }
3388 if (decode[i] == 0xFFFE)
3389 /* unmapped character */
3390 continue;
3391 l1 = decode[i] >> 11;
3392 l2 = decode[i] >> 7;
3393 if (level1[l1] == 0xFF)
3394 level1[l1] = count2++;
3395 if (level2[l2] == 0xFF)
3396 level2[l2] = count3++;
3397 }
3398
3399 if (count2 >= 0xFF || count3 >= 0xFF)
3400 need_dict = 1;
3401
3402 if (need_dict) {
3403 PyObject *result = PyDict_New();
3404 PyObject *key, *value;
3405 if (!result)
3406 return NULL;
3407 for (i = 0; i < 256; i++) {
3408 key = value = NULL;
3409 key = PyInt_FromLong(decode[i]);
3410 value = PyInt_FromLong(i);
3411 if (!key || !value)
3412 goto failed1;
3413 if (PyDict_SetItem(result, key, value) == -1)
3414 goto failed1;
3415 Py_DECREF(key);
3416 Py_DECREF(value);
3417 }
3418 return result;
3419 failed1:
3420 Py_XDECREF(key);
3421 Py_XDECREF(value);
3422 Py_DECREF(result);
3423 return NULL;
3424 }
3425
3426 /* Create a three-level trie */
3427 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3428 16*count2 + 128*count3 - 1);
3429 if (!result)
3430 return PyErr_NoMemory();
3431 PyObject_Init(result, &EncodingMapType);
3432 mresult = (struct encoding_map*)result;
3433 mresult->count2 = count2;
3434 mresult->count3 = count3;
3435 mlevel1 = mresult->level1;
3436 mlevel2 = mresult->level23;
3437 mlevel3 = mresult->level23 + 16*count2;
3438 memcpy(mlevel1, level1, 32);
3439 memset(mlevel2, 0xFF, 16*count2);
3440 memset(mlevel3, 0, 128*count3);
3441 count3 = 0;
3442 for (i = 1; i < 256; i++) {
3443 int o1, o2, o3, i2, i3;
3444 if (decode[i] == 0xFFFE)
3445 /* unmapped character */
3446 continue;
3447 o1 = decode[i]>>11;
3448 o2 = (decode[i]>>7) & 0xF;
3449 i2 = 16*mlevel1[o1] + o2;
3450 if (mlevel2[i2] == 0xFF)
3451 mlevel2[i2] = count3++;
3452 o3 = decode[i] & 0x7F;
3453 i3 = 128*mlevel2[i2] + o3;
3454 mlevel3[i3] = i;
3455 }
3456 return result;
3457}
3458
3459static int
3460encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3461{
3462 struct encoding_map *map = (struct encoding_map*)mapping;
3463 int l1 = c>>11;
3464 int l2 = (c>>7) & 0xF;
3465 int l3 = c & 0x7F;
3466 int i;
3467
3468#ifdef Py_UNICODE_WIDE
3469 if (c > 0xFFFF) {
3470 return -1;
3471 }
3472#endif
3473 if (c == 0)
3474 return 0;
3475 /* level 1*/
3476 i = map->level1[l1];
3477 if (i == 0xFF) {
3478 return -1;
3479 }
3480 /* level 2*/
3481 i = map->level23[16*i+l2];
3482 if (i == 0xFF) {
3483 return -1;
3484 }
3485 /* level 3 */
3486 i = map->level23[16*map->count2 + 128*i + l3];
3487 if (i == 0) {
3488 return -1;
3489 }
3490 return i;
3491}
3492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493/* Lookup the character ch in the mapping. If the character
3494 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003495 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 PyObject *w = PyInt_FromLong((long)c);
3499 PyObject *x;
3500
3501 if (w == NULL)
3502 return NULL;
3503 x = PyObject_GetItem(mapping, w);
3504 Py_DECREF(w);
3505 if (x == NULL) {
3506 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3507 /* No mapping found means: mapping is undefined. */
3508 PyErr_Clear();
3509 x = Py_None;
3510 Py_INCREF(x);
3511 return x;
3512 } else
3513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003515 else if (x == Py_None)
3516 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 else if (PyInt_Check(x)) {
3518 long value = PyInt_AS_LONG(x);
3519 if (value < 0 || value > 255) {
3520 PyErr_SetString(PyExc_TypeError,
3521 "character mapping must be in range(256)");
3522 Py_DECREF(x);
3523 return NULL;
3524 }
3525 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 else if (PyString_Check(x))
3528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 /* wrong return value */
3531 PyErr_SetString(PyExc_TypeError,
3532 "character mapping must return integer, None or str");
3533 Py_DECREF(x);
3534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 }
3536}
3537
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003538static int
3539charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3540{
3541 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3542 /* exponentially overallocate to minimize reallocations */
3543 if (requiredsize < 2*outsize)
3544 requiredsize = 2*outsize;
3545 if (_PyString_Resize(outobj, requiredsize)) {
3546 return 0;
3547 }
3548 return 1;
3549}
3550
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output;
   enc_FAILED    - the mapping has no entry for the character;
   enc_EXCEPTION - the lookup or an output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554/* lookup the character, put the result in the output string and adjust
3555 various state variables. Reallocate the output string if not enough
3556 space is available. Return a new reference to the object that
3557 was put in the output buffer, or Py_None, if the mapping was undefined
3558 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003559 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003561charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003564 PyObject *rep;
3565 char *outstart;
3566 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003568 if (mapping->ob_type == &EncodingMapType) {
3569 int res = encoding_map_lookup(c, mapping);
3570 Py_ssize_t requiredsize = *outpos+1;
3571 if (res == -1)
3572 return enc_FAILED;
3573 if (outsize<requiredsize)
3574 if (!charmapencode_resize(outobj, outpos, requiredsize))
3575 return enc_EXCEPTION;
3576 outstart = PyString_AS_STRING(*outobj);
3577 outstart[(*outpos)++] = (char)res;
3578 return enc_SUCCESS;
3579 }
3580
3581 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003583 return enc_EXCEPTION;
3584 else if (rep==Py_None) {
3585 Py_DECREF(rep);
3586 return enc_FAILED;
3587 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003589 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003590 if (outsize<requiredsize)
3591 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003593 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003595 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3597 }
3598 else {
3599 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003600 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3601 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003602 if (outsize<requiredsize)
3603 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003605 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003607 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 memcpy(outstart + *outpos, repchars, repsize);
3609 *outpos += repsize;
3610 }
3611 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003612 Py_DECREF(rep);
3613 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614}
3615
3616/* handle an error in PyUnicode_EncodeCharmap
3617 Return 0 on success, -1 on error */
3618static
3619int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003620 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003622 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624{
3625 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 Py_ssize_t repsize;
3627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_UNICODE *uni2;
3629 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003630 Py_ssize_t collstartpos = *inpos;
3631 Py_ssize_t collendpos = *inpos+1;
3632 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 char *encoding = "charmap";
3634 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003635 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 /* find all unencodable characters */
3638 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003639 PyObject *rep;
3640 if (mapping->ob_type == &EncodingMapType) {
3641 int res = encoding_map_lookup(p[collendpos], mapping);
3642 if (res != -1)
3643 break;
3644 ++collendpos;
3645 continue;
3646 }
3647
3648 rep = charmapencode_lookup(p[collendpos], mapping);
3649 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003651 else if (rep!=Py_None) {
3652 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 break;
3654 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003655 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 ++collendpos;
3657 }
3658 /* cache callback name lookup
3659 * (if not done yet, i.e. it's the first error) */
3660 if (*known_errorHandler==-1) {
3661 if ((errors==NULL) || (!strcmp(errors, "strict")))
3662 *known_errorHandler = 1;
3663 else if (!strcmp(errors, "replace"))
3664 *known_errorHandler = 2;
3665 else if (!strcmp(errors, "ignore"))
3666 *known_errorHandler = 3;
3667 else if (!strcmp(errors, "xmlcharrefreplace"))
3668 *known_errorHandler = 4;
3669 else
3670 *known_errorHandler = 0;
3671 }
3672 switch (*known_errorHandler) {
3673 case 1: /* strict */
3674 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3675 return -1;
3676 case 2: /* replace */
3677 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3678 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003679 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 return -1;
3681 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003682 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3684 return -1;
3685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 }
3687 /* fall through */
3688 case 3: /* ignore */
3689 *inpos = collendpos;
3690 break;
3691 case 4: /* xmlcharrefreplace */
3692 /* generate replacement (temporarily (mis)uses p) */
3693 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3694 char buffer[2+29+1+1];
3695 char *cp;
3696 sprintf(buffer, "&#%d;", (int)p[collpos]);
3697 for (cp = buffer; *cp; ++cp) {
3698 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003699 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003701 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3703 return -1;
3704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 }
3706 }
3707 *inpos = collendpos;
3708 break;
3709 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003710 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 encoding, reason, p, size, exceptionObject,
3712 collstartpos, collendpos, &newpos);
3713 if (repunicode == NULL)
3714 return -1;
3715 /* generate replacement */
3716 repsize = PyUnicode_GET_SIZE(repunicode);
3717 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3718 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003719 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 return -1;
3721 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003722 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3725 return -1;
3726 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 }
3728 *inpos = newpos;
3729 Py_DECREF(repunicode);
3730 }
3731 return 0;
3732}
3733
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003735 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 PyObject *mapping,
3737 const char *errors)
3738{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 /* output object */
3740 PyObject *res = NULL;
3741 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003742 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003744 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 PyObject *errorHandler = NULL;
3746 PyObject *exc = NULL;
3747 /* the following variable is used for caching string comparisons
3748 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3749 * 3=ignore, 4=xmlcharrefreplace */
3750 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751
3752 /* Default to Latin-1 */
3753 if (mapping == NULL)
3754 return PyUnicode_EncodeLatin1(p, size, errors);
3755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 /* allocate enough for a simple encoding without
3757 replacements, if we need more, we'll resize */
3758 res = PyString_FromStringAndSize(NULL, size);
3759 if (res == NULL)
3760 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003761 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 while (inpos<size) {
3765 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003766 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3767 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003769 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 if (charmap_encoding_error(p, size, &inpos, mapping,
3771 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003772 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003773 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003774 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 else
3778 /* done with this character => adjust input position */
3779 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 /* Resize if we allocated to much */
3783 if (respos<PyString_GET_SIZE(res)) {
3784 if (_PyString_Resize(&res, respos))
3785 goto onError;
3786 }
3787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
3789 return res;
3790
3791 onError:
3792 Py_XDECREF(res);
3793 Py_XDECREF(exc);
3794 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 return NULL;
3796}
3797
3798PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3799 PyObject *mapping)
3800{
3801 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3802 PyErr_BadArgument();
3803 return NULL;
3804 }
3805 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3806 PyUnicode_GET_SIZE(unicode),
3807 mapping,
3808 NULL);
3809}
3810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811/* create or adjust a UnicodeTranslateError */
3812static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003813 const Py_UNICODE *unicode, Py_ssize_t size,
3814 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 if (*exceptionObject == NULL) {
3818 *exceptionObject = PyUnicodeTranslateError_Create(
3819 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 }
3821 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3823 goto onError;
3824 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3825 goto onError;
3826 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3827 goto onError;
3828 return;
3829 onError:
3830 Py_DECREF(*exceptionObject);
3831 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 }
3833}
3834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835/* raises a UnicodeTranslateError */
3836static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003837 const Py_UNICODE *unicode, Py_ssize_t size,
3838 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 const char *reason)
3840{
3841 make_translate_exception(exceptionObject,
3842 unicode, size, startpos, endpos, reason);
3843 if (*exceptionObject != NULL)
3844 PyCodec_StrictErrors(*exceptionObject);
3845}
3846
3847/* error handling callback helper:
3848 build arguments, call the callback and check the arguments,
3849 put the result into newpos and return the replacement string, which
3850 has to be freed by the caller */
3851static PyObject *unicode_translate_call_errorhandler(const char *errors,
3852 PyObject **errorHandler,
3853 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3855 Py_ssize_t startpos, Py_ssize_t endpos,
3856 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003858 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003860 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 PyObject *restuple;
3862 PyObject *resunicode;
3863
3864 if (*errorHandler == NULL) {
3865 *errorHandler = PyCodec_LookupError(errors);
3866 if (*errorHandler == NULL)
3867 return NULL;
3868 }
3869
3870 make_translate_exception(exceptionObject,
3871 unicode, size, startpos, endpos, reason);
3872 if (*exceptionObject == NULL)
3873 return NULL;
3874
3875 restuple = PyObject_CallFunctionObjArgs(
3876 *errorHandler, *exceptionObject, NULL);
3877 if (restuple == NULL)
3878 return NULL;
3879 if (!PyTuple_Check(restuple)) {
3880 PyErr_Format(PyExc_TypeError, &argparse[4]);
3881 Py_DECREF(restuple);
3882 return NULL;
3883 }
3884 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003885 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 Py_DECREF(restuple);
3887 return NULL;
3888 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 if (i_newpos<0)
3890 *newpos = size+i_newpos;
3891 else
3892 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003893 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003894 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003895 Py_DECREF(restuple);
3896 return NULL;
3897 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 Py_INCREF(resunicode);
3899 Py_DECREF(restuple);
3900 return resunicode;
3901}
3902
3903/* Lookup the character ch in the mapping and put the result in result,
3904 which must be decrefed by the caller.
3905 Return 0 on success, -1 on error */
3906static
3907int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3908{
3909 PyObject *w = PyInt_FromLong((long)c);
3910 PyObject *x;
3911
3912 if (w == NULL)
3913 return -1;
3914 x = PyObject_GetItem(mapping, w);
3915 Py_DECREF(w);
3916 if (x == NULL) {
3917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3918 /* No mapping found means: use 1:1 mapping. */
3919 PyErr_Clear();
3920 *result = NULL;
3921 return 0;
3922 } else
3923 return -1;
3924 }
3925 else if (x == Py_None) {
3926 *result = x;
3927 return 0;
3928 }
3929 else if (PyInt_Check(x)) {
3930 long value = PyInt_AS_LONG(x);
3931 long max = PyUnicode_GetMax();
3932 if (value < 0 || value > max) {
3933 PyErr_Format(PyExc_TypeError,
3934 "character mapping must be in range(0x%lx)", max+1);
3935 Py_DECREF(x);
3936 return -1;
3937 }
3938 *result = x;
3939 return 0;
3940 }
3941 else if (PyUnicode_Check(x)) {
3942 *result = x;
3943 return 0;
3944 }
3945 else {
3946 /* wrong return value */
3947 PyErr_SetString(PyExc_TypeError,
3948 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003949 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 return -1;
3951 }
3952}
3953/* ensure that *outobj is at least requiredsize characters long,
3954if not reallocate and adjust various state variables.
3955Return 0 on success, -1 on error */
3956static
Walter Dörwald4894c302003-10-24 14:25:28 +00003957int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003960 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003961 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003965 if (requiredsize < 2 * oldsize)
3966 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003967 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 return -1;
3969 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 }
3971 return 0;
3972}
3973/* lookup the character, put the result in the output string and adjust
3974 various state variables. Return a new reference to the object that
3975 was put in the output buffer in *result, or Py_None, if the mapping was
3976 undefined (in which case no character was written).
3977 The called must decref result.
3978 Return 0 on success, -1 on error. */
3979static
Walter Dörwald4894c302003-10-24 14:25:28 +00003980int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003981 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003982 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983{
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 return -1;
3986 if (*res==NULL) {
3987 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003988 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 }
3990 else if (*res==Py_None)
3991 ;
3992 else if (PyInt_Check(*res)) {
3993 /* no overflow check, because we know that the space is enough */
3994 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3995 }
3996 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 if (repsize==1) {
3999 /* no overflow check, because we know that the space is enough */
4000 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4001 }
4002 else if (repsize!=0) {
4003 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004004 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004005 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004006 repsize - 1;
4007 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 return -1;
4009 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4010 *outp += repsize;
4011 }
4012 }
4013 else
4014 return -1;
4015 return 0;
4016}
4017
4018PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004019 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 PyObject *mapping,
4021 const char *errors)
4022{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 /* output object */
4024 PyObject *res = NULL;
4025 /* pointers to the beginning and end+1 of input */
4026 const Py_UNICODE *startp = p;
4027 const Py_UNICODE *endp = p + size;
4028 /* pointer into the output */
4029 Py_UNICODE *str;
4030 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004031 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 char *reason = "character maps to <undefined>";
4033 PyObject *errorHandler = NULL;
4034 PyObject *exc = NULL;
4035 /* the following variable is used for caching string comparisons
4036 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4037 * 3=ignore, 4=xmlcharrefreplace */
4038 int known_errorHandler = -1;
4039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 if (mapping == NULL) {
4041 PyErr_BadArgument();
4042 return NULL;
4043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044
4045 /* allocate enough for a simple 1:1 translation without
4046 replacements, if we need more, we'll resize */
4047 res = PyUnicode_FromUnicode(NULL, size);
4048 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004049 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 return res;
4052 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 while (p<endp) {
4055 /* try to encode it */
4056 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004057 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 goto onError;
4060 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004061 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 if (x!=Py_None) /* it worked => adjust input pointer */
4063 ++p;
4064 else { /* untranslatable character */
4065 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t repsize;
4067 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 Py_UNICODE *uni2;
4069 /* startpos for collecting untranslatable chars */
4070 const Py_UNICODE *collstart = p;
4071 const Py_UNICODE *collend = p+1;
4072 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 /* find all untranslatable characters */
4075 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004076 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 goto onError;
4078 Py_XDECREF(x);
4079 if (x!=Py_None)
4080 break;
4081 ++collend;
4082 }
4083 /* cache callback name lookup
4084 * (if not done yet, i.e. it's the first error) */
4085 if (known_errorHandler==-1) {
4086 if ((errors==NULL) || (!strcmp(errors, "strict")))
4087 known_errorHandler = 1;
4088 else if (!strcmp(errors, "replace"))
4089 known_errorHandler = 2;
4090 else if (!strcmp(errors, "ignore"))
4091 known_errorHandler = 3;
4092 else if (!strcmp(errors, "xmlcharrefreplace"))
4093 known_errorHandler = 4;
4094 else
4095 known_errorHandler = 0;
4096 }
4097 switch (known_errorHandler) {
4098 case 1: /* strict */
4099 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4100 goto onError;
4101 case 2: /* replace */
4102 /* No need to check for space, this is a 1:1 replacement */
4103 for (coll = collstart; coll<collend; ++coll)
4104 *str++ = '?';
4105 /* fall through */
4106 case 3: /* ignore */
4107 p = collend;
4108 break;
4109 case 4: /* xmlcharrefreplace */
4110 /* generate replacement (temporarily (mis)uses p) */
4111 for (p = collstart; p < collend; ++p) {
4112 char buffer[2+29+1+1];
4113 char *cp;
4114 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004115 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4117 goto onError;
4118 for (cp = buffer; *cp; ++cp)
4119 *str++ = *cp;
4120 }
4121 p = collend;
4122 break;
4123 default:
4124 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4125 reason, startp, size, &exc,
4126 collstart-startp, collend-startp, &newpos);
4127 if (repunicode == NULL)
4128 goto onError;
4129 /* generate replacement */
4130 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004131 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4133 Py_DECREF(repunicode);
4134 goto onError;
4135 }
4136 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4137 *str++ = *uni2;
4138 p = startp + newpos;
4139 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 }
4141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 /* Resize if we allocated to much */
4144 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004145 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004146 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004147 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 }
4149 Py_XDECREF(exc);
4150 Py_XDECREF(errorHandler);
4151 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 onError:
4154 Py_XDECREF(res);
4155 Py_XDECREF(exc);
4156 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 return NULL;
4158}
4159
4160PyObject *PyUnicode_Translate(PyObject *str,
4161 PyObject *mapping,
4162 const char *errors)
4163{
4164 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004165
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 str = PyUnicode_FromObject(str);
4167 if (str == NULL)
4168 goto onError;
4169 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4170 PyUnicode_GET_SIZE(str),
4171 mapping,
4172 errors);
4173 Py_DECREF(str);
4174 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004175
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 onError:
4177 Py_XDECREF(str);
4178 return NULL;
4179}
Tim Petersced69f82003-09-16 20:30:58 +00004180
Guido van Rossum9e896b32000-04-05 20:11:21 +00004181/* --- Decimal Encoder ---------------------------------------------------- */
4182
4183int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004184 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004185 char *output,
4186 const char *errors)
4187{
4188 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 PyObject *errorHandler = NULL;
4190 PyObject *exc = NULL;
4191 const char *encoding = "decimal";
4192 const char *reason = "invalid decimal Unicode string";
4193 /* the following variable is used for caching string comparisons
4194 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4195 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004196
4197 if (output == NULL) {
4198 PyErr_BadArgument();
4199 return -1;
4200 }
4201
4202 p = s;
4203 end = s + length;
4204 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004206 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004208 Py_ssize_t repsize;
4209 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 Py_UNICODE *uni2;
4211 Py_UNICODE *collstart;
4212 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004213
Guido van Rossum9e896b32000-04-05 20:11:21 +00004214 if (Py_UNICODE_ISSPACE(ch)) {
4215 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004217 continue;
4218 }
4219 decimal = Py_UNICODE_TODECIMAL(ch);
4220 if (decimal >= 0) {
4221 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004223 continue;
4224 }
Guido van Rossumba477042000-04-06 18:18:10 +00004225 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004226 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004228 continue;
4229 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 /* All other characters are considered unencodable */
4231 collstart = p;
4232 collend = p+1;
4233 while (collend < end) {
4234 if ((0 < *collend && *collend < 256) ||
4235 !Py_UNICODE_ISSPACE(*collend) ||
4236 Py_UNICODE_TODECIMAL(*collend))
4237 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 /* cache callback name lookup
4240 * (if not done yet, i.e. it's the first error) */
4241 if (known_errorHandler==-1) {
4242 if ((errors==NULL) || (!strcmp(errors, "strict")))
4243 known_errorHandler = 1;
4244 else if (!strcmp(errors, "replace"))
4245 known_errorHandler = 2;
4246 else if (!strcmp(errors, "ignore"))
4247 known_errorHandler = 3;
4248 else if (!strcmp(errors, "xmlcharrefreplace"))
4249 known_errorHandler = 4;
4250 else
4251 known_errorHandler = 0;
4252 }
4253 switch (known_errorHandler) {
4254 case 1: /* strict */
4255 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4256 goto onError;
4257 case 2: /* replace */
4258 for (p = collstart; p < collend; ++p)
4259 *output++ = '?';
4260 /* fall through */
4261 case 3: /* ignore */
4262 p = collend;
4263 break;
4264 case 4: /* xmlcharrefreplace */
4265 /* generate replacement (temporarily (mis)uses p) */
4266 for (p = collstart; p < collend; ++p)
4267 output += sprintf(output, "&#%d;", (int)*p);
4268 p = collend;
4269 break;
4270 default:
4271 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4272 encoding, reason, s, length, &exc,
4273 collstart-s, collend-s, &newpos);
4274 if (repunicode == NULL)
4275 goto onError;
4276 /* generate replacement */
4277 repsize = PyUnicode_GET_SIZE(repunicode);
4278 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4279 Py_UNICODE ch = *uni2;
4280 if (Py_UNICODE_ISSPACE(ch))
4281 *output++ = ' ';
4282 else {
4283 decimal = Py_UNICODE_TODECIMAL(ch);
4284 if (decimal >= 0)
4285 *output++ = '0' + decimal;
4286 else if (0 < ch && ch < 256)
4287 *output++ = (char)ch;
4288 else {
4289 Py_DECREF(repunicode);
4290 raise_encode_exception(&exc, encoding,
4291 s, length, collstart-s, collend-s, reason);
4292 goto onError;
4293 }
4294 }
4295 }
4296 p = s + newpos;
4297 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004298 }
4299 }
4300 /* 0-terminate the output string */
4301 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 Py_XDECREF(exc);
4303 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004304 return 0;
4305
4306 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 Py_XDECREF(exc);
4308 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004309 return -1;
4310}
4311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312/* --- Helpers ------------------------------------------------------------ */
4313
Thomas Wouters477c8d52006-05-27 19:21:47 +00004314#define STRINGLIB_CHAR Py_UNICODE
4315
4316#define STRINGLIB_LEN PyUnicode_GET_SIZE
4317#define STRINGLIB_NEW PyUnicode_FromUnicode
4318#define STRINGLIB_STR PyUnicode_AS_UNICODE
4319
4320Py_LOCAL_INLINE(int)
4321STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004323 if (str[0] != other[0])
4324 return 1;
4325 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326}
4327
Thomas Wouters477c8d52006-05-27 19:21:47 +00004328#define STRINGLIB_EMPTY unicode_empty
4329
4330#include "stringlib/fastsearch.h"
4331
4332#include "stringlib/count.h"
4333#include "stringlib/find.h"
4334#include "stringlib/partition.h"
4335
/* Helper macro normalizing the slice bounds held in the local variables
   `start` and `end` against (obj)->length: a negative bound is taken
   relative to the end and then clamped to 0, and `end` is capped at the
   length.  `start` is not capped at the length.
   Expands to a sequence of statements, not a single one, so it must not
   be used as the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4348
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004350 PyObject *substr,
4351 Py_ssize_t start,
4352 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004355 PyUnicodeObject* str_obj;
4356 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004357
Thomas Wouters477c8d52006-05-27 19:21:47 +00004358 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4359 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004361 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4362 if (!sub_obj) {
4363 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 return -1;
4365 }
Tim Petersced69f82003-09-16 20:30:58 +00004366
Thomas Wouters477c8d52006-05-27 19:21:47 +00004367 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004368
Thomas Wouters477c8d52006-05-27 19:21:47 +00004369 result = stringlib_count(
4370 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4371 );
4372
4373 Py_DECREF(sub_obj);
4374 Py_DECREF(str_obj);
4375
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 return result;
4377}
4378
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004380 PyObject *sub,
4381 Py_ssize_t start,
4382 Py_ssize_t end,
4383 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004388 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004389 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004390 sub = PyUnicode_FromObject(sub);
4391 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004392 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004393 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 }
Tim Petersced69f82003-09-16 20:30:58 +00004395
Thomas Wouters477c8d52006-05-27 19:21:47 +00004396 if (direction > 0)
4397 result = stringlib_find_slice(
4398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4400 start, end
4401 );
4402 else
4403 result = stringlib_rfind_slice(
4404 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4405 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4406 start, end
4407 );
4408
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004410 Py_DECREF(sub);
4411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 return result;
4413}
4414
Tim Petersced69f82003-09-16 20:30:58 +00004415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416int tailmatch(PyUnicodeObject *self,
4417 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 Py_ssize_t start,
4419 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 int direction)
4421{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 if (substring->length == 0)
4423 return 1;
4424
Thomas Wouters477c8d52006-05-27 19:21:47 +00004425 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426
4427 end -= substring->length;
4428 if (end < start)
4429 return 0;
4430
4431 if (direction > 0) {
4432 if (Py_UNICODE_MATCH(self, end, substring))
4433 return 1;
4434 } else {
4435 if (Py_UNICODE_MATCH(self, start, substring))
4436 return 1;
4437 }
4438
4439 return 0;
4440}
4441
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t start,
4445 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 int direction)
4447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004448 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 str = PyUnicode_FromObject(str);
4451 if (str == NULL)
4452 return -1;
4453 substr = PyUnicode_FromObject(substr);
4454 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004455 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 return -1;
4457 }
Tim Petersced69f82003-09-16 20:30:58 +00004458
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 result = tailmatch((PyUnicodeObject *)str,
4460 (PyUnicodeObject *)substr,
4461 start, end, direction);
4462 Py_DECREF(str);
4463 Py_DECREF(substr);
4464 return result;
4465}
4466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467/* Apply fixfct filter to the Unicode object self and return a
4468 reference to the modified object */
4469
Tim Petersced69f82003-09-16 20:30:58 +00004470static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471PyObject *fixup(PyUnicodeObject *self,
4472 int (*fixfct)(PyUnicodeObject *s))
4473{
4474
4475 PyUnicodeObject *u;
4476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004477 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 if (u == NULL)
4479 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004480
4481 Py_UNICODE_COPY(u->str, self->str, self->length);
4482
Tim Peters7a29bd52001-09-12 03:03:31 +00004483 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 /* fixfct should return TRUE if it modified the buffer. If
4485 FALSE, return a reference to the original buffer instead
4486 (to save space, not time) */
4487 Py_INCREF(self);
4488 Py_DECREF(u);
4489 return (PyObject*) self;
4490 }
4491 return (PyObject*) u;
4492}
4493
Tim Petersced69f82003-09-16 20:30:58 +00004494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495int fixupper(PyUnicodeObject *self)
4496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 Py_UNICODE *s = self->str;
4499 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004500
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 while (len-- > 0) {
4502 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004503
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 ch = Py_UNICODE_TOUPPER(*s);
4505 if (ch != *s) {
4506 status = 1;
4507 *s = ch;
4508 }
4509 s++;
4510 }
4511
4512 return status;
4513}
4514
Tim Petersced69f82003-09-16 20:30:58 +00004515static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516int fixlower(PyUnicodeObject *self)
4517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004518 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 Py_UNICODE *s = self->str;
4520 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004521
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 while (len-- > 0) {
4523 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004524
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 ch = Py_UNICODE_TOLOWER(*s);
4526 if (ch != *s) {
4527 status = 1;
4528 *s = ch;
4529 }
4530 s++;
4531 }
4532
4533 return status;
4534}
4535
Tim Petersced69f82003-09-16 20:30:58 +00004536static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537int fixswapcase(PyUnicodeObject *self)
4538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 Py_UNICODE *s = self->str;
4541 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 while (len-- > 0) {
4544 if (Py_UNICODE_ISUPPER(*s)) {
4545 *s = Py_UNICODE_TOLOWER(*s);
4546 status = 1;
4547 } else if (Py_UNICODE_ISLOWER(*s)) {
4548 *s = Py_UNICODE_TOUPPER(*s);
4549 status = 1;
4550 }
4551 s++;
4552 }
4553
4554 return status;
4555}
4556
Tim Petersced69f82003-09-16 20:30:58 +00004557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558int fixcapitalize(PyUnicodeObject *self)
4559{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004560 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004561 Py_UNICODE *s = self->str;
4562 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004563
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004564 if (len == 0)
4565 return 0;
4566 if (Py_UNICODE_ISLOWER(*s)) {
4567 *s = Py_UNICODE_TOUPPER(*s);
4568 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004570 s++;
4571 while (--len > 0) {
4572 if (Py_UNICODE_ISUPPER(*s)) {
4573 *s = Py_UNICODE_TOLOWER(*s);
4574 status = 1;
4575 }
4576 s++;
4577 }
4578 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579}
4580
4581static
4582int fixtitle(PyUnicodeObject *self)
4583{
4584 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4585 register Py_UNICODE *e;
4586 int previous_is_cased;
4587
4588 /* Shortcut for single character strings */
4589 if (PyUnicode_GET_SIZE(self) == 1) {
4590 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4591 if (*p != ch) {
4592 *p = ch;
4593 return 1;
4594 }
4595 else
4596 return 0;
4597 }
Tim Petersced69f82003-09-16 20:30:58 +00004598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 e = p + PyUnicode_GET_SIZE(self);
4600 previous_is_cased = 0;
4601 for (; p < e; p++) {
4602 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 if (previous_is_cased)
4605 *p = Py_UNICODE_TOLOWER(ch);
4606 else
4607 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004608
4609 if (Py_UNICODE_ISLOWER(ch) ||
4610 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 Py_UNICODE_ISTITLE(ch))
4612 previous_is_cased = 1;
4613 else
4614 previous_is_cased = 0;
4615 }
4616 return 1;
4617}
4618
Tim Peters8ce9f162004-08-27 01:49:32 +00004619PyObject *
4620PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621{
Tim Peters8ce9f162004-08-27 01:49:32 +00004622 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004623 const Py_UNICODE blank = ' ';
4624 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004625 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004626 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004627 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4628 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004629 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4630 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004631 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004632 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004633 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634
Tim Peters05eba1f2004-08-27 21:32:02 +00004635 fseq = PySequence_Fast(seq, "");
4636 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004638 }
4639
Tim Peters91879ab2004-08-27 22:35:44 +00004640 /* Grrrr. A codec may be invoked to convert str objects to
4641 * Unicode, and so it's possible to call back into Python code
4642 * during PyUnicode_FromObject(), and so it's possible for a sick
4643 * codec to change the size of fseq (if seq is a list). Therefore
4644 * we have to keep refetching the size -- can't assume seqlen
4645 * is invariant.
4646 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004647 seqlen = PySequence_Fast_GET_SIZE(fseq);
4648 /* If empty sequence, return u"". */
4649 if (seqlen == 0) {
4650 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4651 goto Done;
4652 }
4653 /* If singleton sequence with an exact Unicode, return that. */
4654 if (seqlen == 1) {
4655 item = PySequence_Fast_GET_ITEM(fseq, 0);
4656 if (PyUnicode_CheckExact(item)) {
4657 Py_INCREF(item);
4658 res = (PyUnicodeObject *)item;
4659 goto Done;
4660 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004661 }
4662
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 /* At least two items to join, or one that isn't exact Unicode. */
4664 if (seqlen > 1) {
4665 /* Set up sep and seplen -- they're needed. */
4666 if (separator == NULL) {
4667 sep = &blank;
4668 seplen = 1;
4669 }
4670 else {
4671 internal_separator = PyUnicode_FromObject(separator);
4672 if (internal_separator == NULL)
4673 goto onError;
4674 sep = PyUnicode_AS_UNICODE(internal_separator);
4675 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004676 /* In case PyUnicode_FromObject() mutated seq. */
4677 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004678 }
4679 }
4680
4681 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004682 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004683 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004684 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004685 res_p = PyUnicode_AS_UNICODE(res);
4686 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004687
Tim Peters05eba1f2004-08-27 21:32:02 +00004688 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004689 Py_ssize_t itemlen;
4690 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004691
4692 item = PySequence_Fast_GET_ITEM(fseq, i);
4693 /* Convert item to Unicode. */
4694 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4695 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004696 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004697 " %.80s found",
4698 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004699 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004700 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004701 item = PyUnicode_FromObject(item);
4702 if (item == NULL)
4703 goto onError;
4704 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004705
Tim Peters91879ab2004-08-27 22:35:44 +00004706 /* In case PyUnicode_FromObject() mutated seq. */
4707 seqlen = PySequence_Fast_GET_SIZE(fseq);
4708
Tim Peters8ce9f162004-08-27 01:49:32 +00004709 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004711 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004712 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004713 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004714 if (i < seqlen - 1) {
4715 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004716 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004717 goto Overflow;
4718 }
4719 if (new_res_used > res_alloc) {
4720 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004721 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004722 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004723 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004724 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004725 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004726 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004727 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004729 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004730 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004732
4733 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004734 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004735 res_p += itemlen;
4736 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004737 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004738 res_p += seplen;
4739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004741 res_used = new_res_used;
4742 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004743
Tim Peters05eba1f2004-08-27 21:32:02 +00004744 /* Shrink res to match the used area; this probably can't fail,
4745 * but it's cheap to check.
4746 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004747 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004748 goto onError;
4749
4750 Done:
4751 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004752 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 return (PyObject *)res;
4754
Tim Peters8ce9f162004-08-27 01:49:32 +00004755 Overflow:
4756 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004757 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Py_DECREF(item);
4759 /* fall through */
4760
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004762 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004763 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004764 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 return NULL;
4766}
4767
Tim Petersced69f82003-09-16 20:30:58 +00004768static
4769PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770 Py_ssize_t left,
4771 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 Py_UNICODE fill)
4773{
4774 PyUnicodeObject *u;
4775
4776 if (left < 0)
4777 left = 0;
4778 if (right < 0)
4779 right = 0;
4780
Tim Peters7a29bd52001-09-12 03:03:31 +00004781 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 Py_INCREF(self);
4783 return self;
4784 }
4785
4786 u = _PyUnicode_New(left + self->length + right);
4787 if (u) {
4788 if (left)
4789 Py_UNICODE_FILL(u->str, fill, left);
4790 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4791 if (right)
4792 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4793 }
4794
4795 return u;
4796}
4797
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion site must provide a scratch variable `str`, the target
   `list`, and an `onError` label; control jumps to `onError` when the
   slice cannot be created or when the append fails.  The reference held
   in `str` is released on both paths.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and stays correct under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4808
4809static
4810PyObject *split_whitespace(PyUnicodeObject *self,
4811 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004812 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004814 register Py_ssize_t i;
4815 register Py_ssize_t j;
4816 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 PyObject *str;
4818
4819 for (i = j = 0; i < len; ) {
4820 /* find a token */
4821 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4822 i++;
4823 j = i;
4824 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4825 i++;
4826 if (j < i) {
4827 if (maxcount-- <= 0)
4828 break;
4829 SPLIT_APPEND(self->str, j, i);
4830 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4831 i++;
4832 j = i;
4833 }
4834 }
4835 if (j < len) {
4836 SPLIT_APPEND(self->str, j, len);
4837 }
4838 return list;
4839
4840 onError:
4841 Py_DECREF(list);
4842 return NULL;
4843}
4844
4845PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004846 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 register Py_ssize_t i;
4849 register Py_ssize_t j;
4850 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 PyObject *list;
4852 PyObject *str;
4853 Py_UNICODE *data;
4854
4855 string = PyUnicode_FromObject(string);
4856 if (string == NULL)
4857 return NULL;
4858 data = PyUnicode_AS_UNICODE(string);
4859 len = PyUnicode_GET_SIZE(string);
4860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 list = PyList_New(0);
4862 if (!list)
4863 goto onError;
4864
4865 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004869 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
4872 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004873 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 if (i < len) {
4875 if (data[i] == '\r' && i + 1 < len &&
4876 data[i+1] == '\n')
4877 i += 2;
4878 else
4879 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004880 if (keepends)
4881 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 }
Guido van Rossum86662912000-04-11 15:38:46 +00004883 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 j = i;
4885 }
4886 if (j < len) {
4887 SPLIT_APPEND(data, j, len);
4888 }
4889
4890 Py_DECREF(string);
4891 return list;
4892
4893 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004894 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 Py_DECREF(string);
4896 return NULL;
4897}
4898
Tim Petersced69f82003-09-16 20:30:58 +00004899static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900PyObject *split_char(PyUnicodeObject *self,
4901 PyObject *list,
4902 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004903 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004905 register Py_ssize_t i;
4906 register Py_ssize_t j;
4907 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 PyObject *str;
4909
4910 for (i = j = 0; i < len; ) {
4911 if (self->str[i] == ch) {
4912 if (maxcount-- <= 0)
4913 break;
4914 SPLIT_APPEND(self->str, j, i);
4915 i = j = i + 1;
4916 } else
4917 i++;
4918 }
4919 if (j <= len) {
4920 SPLIT_APPEND(self->str, j, len);
4921 }
4922 return list;
4923
4924 onError:
4925 Py_DECREF(list);
4926 return NULL;
4927}
4928
Tim Petersced69f82003-09-16 20:30:58 +00004929static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930PyObject *split_substring(PyUnicodeObject *self,
4931 PyObject *list,
4932 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 register Py_ssize_t i;
4936 register Py_ssize_t j;
4937 Py_ssize_t len = self->length;
4938 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 PyObject *str;
4940
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004941 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 if (Py_UNICODE_MATCH(self, i, substring)) {
4943 if (maxcount-- <= 0)
4944 break;
4945 SPLIT_APPEND(self->str, j, i);
4946 i = j = i + sublen;
4947 } else
4948 i++;
4949 }
4950 if (j <= len) {
4951 SPLIT_APPEND(self->str, j, len);
4952 }
4953 return list;
4954
4955 onError:
4956 Py_DECREF(list);
4957 return NULL;
4958}
4959
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960static
4961PyObject *rsplit_whitespace(PyUnicodeObject *self,
4962 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004963 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004964{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004965 register Py_ssize_t i;
4966 register Py_ssize_t j;
4967 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004968 PyObject *str;
4969
4970 for (i = j = len - 1; i >= 0; ) {
4971 /* find a token */
4972 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4973 i--;
4974 j = i;
4975 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4976 i--;
4977 if (j > i) {
4978 if (maxcount-- <= 0)
4979 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004980 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004981 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4982 i--;
4983 j = i;
4984 }
4985 }
4986 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004987 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004988 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004989 if (PyList_Reverse(list) < 0)
4990 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004991 return list;
4992
4993 onError:
4994 Py_DECREF(list);
4995 return NULL;
4996}
4997
4998static
4999PyObject *rsplit_char(PyUnicodeObject *self,
5000 PyObject *list,
5001 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005002 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005004 register Py_ssize_t i;
5005 register Py_ssize_t j;
5006 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005007 PyObject *str;
5008
5009 for (i = j = len - 1; i >= 0; ) {
5010 if (self->str[i] == ch) {
5011 if (maxcount-- <= 0)
5012 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005013 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005014 j = i = i - 1;
5015 } else
5016 i--;
5017 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005018 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005019 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005020 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005021 if (PyList_Reverse(list) < 0)
5022 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005023 return list;
5024
5025 onError:
5026 Py_DECREF(list);
5027 return NULL;
5028}
5029
5030static
5031PyObject *rsplit_substring(PyUnicodeObject *self,
5032 PyObject *list,
5033 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005034 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005035{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005036 register Py_ssize_t i;
5037 register Py_ssize_t j;
5038 Py_ssize_t len = self->length;
5039 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005040 PyObject *str;
5041
5042 for (i = len - sublen, j = len; i >= 0; ) {
5043 if (Py_UNICODE_MATCH(self, i, substring)) {
5044 if (maxcount-- <= 0)
5045 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005047 j = i;
5048 i -= sublen;
5049 } else
5050 i--;
5051 }
5052 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005054 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 if (PyList_Reverse(list) < 0)
5056 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005057 return list;
5058
5059 onError:
5060 Py_DECREF(list);
5061 return NULL;
5062}
5063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064#undef SPLIT_APPEND
5065
5066static
5067PyObject *split(PyUnicodeObject *self,
5068 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005069 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070{
5071 PyObject *list;
5072
5073 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005074 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075
5076 list = PyList_New(0);
5077 if (!list)
5078 return NULL;
5079
5080 if (substring == NULL)
5081 return split_whitespace(self,list,maxcount);
5082
5083 else if (substring->length == 1)
5084 return split_char(self,list,substring->str[0],maxcount);
5085
5086 else if (substring->length == 0) {
5087 Py_DECREF(list);
5088 PyErr_SetString(PyExc_ValueError, "empty separator");
5089 return NULL;
5090 }
5091 else
5092 return split_substring(self,list,substring,maxcount);
5093}
5094
Tim Petersced69f82003-09-16 20:30:58 +00005095static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005096PyObject *rsplit(PyUnicodeObject *self,
5097 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005098 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005099{
5100 PyObject *list;
5101
5102 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005103 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005104
5105 list = PyList_New(0);
5106 if (!list)
5107 return NULL;
5108
5109 if (substring == NULL)
5110 return rsplit_whitespace(self,list,maxcount);
5111
5112 else if (substring->length == 1)
5113 return rsplit_char(self,list,substring->str[0],maxcount);
5114
5115 else if (substring->length == 0) {
5116 Py_DECREF(list);
5117 PyErr_SetString(PyExc_ValueError, "empty separator");
5118 return NULL;
5119 }
5120 else
5121 return rsplit_substring(self,list,substring,maxcount);
5122}
5123
5124static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125PyObject *replace(PyUnicodeObject *self,
5126 PyUnicodeObject *str1,
5127 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129{
5130 PyUnicodeObject *u;
5131
5132 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005133 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
Thomas Wouters477c8d52006-05-27 19:21:47 +00005135 if (str1->length == str2->length) {
5136 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005137 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005138 if (str1->length == 1) {
5139 /* replace characters */
5140 Py_UNICODE u1, u2;
5141 if (!findchar(self->str, self->length, str1->str[0]))
5142 goto nothing;
5143 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5144 if (!u)
5145 return NULL;
5146 Py_UNICODE_COPY(u->str, self->str, self->length);
5147 u1 = str1->str[0];
5148 u2 = str2->str[0];
5149 for (i = 0; i < u->length; i++)
5150 if (u->str[i] == u1) {
5151 if (--maxcount < 0)
5152 break;
5153 u->str[i] = u2;
5154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005156 i = fastsearch(
5157 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005159 if (i < 0)
5160 goto nothing;
5161 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5162 if (!u)
5163 return NULL;
5164 Py_UNICODE_COPY(u->str, self->str, self->length);
5165 while (i <= self->length - str1->length)
5166 if (Py_UNICODE_MATCH(self, i, str1)) {
5167 if (--maxcount < 0)
5168 break;
5169 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5170 i += str1->length;
5171 } else
5172 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175
5176 Py_ssize_t n, i, j, e;
5177 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 Py_UNICODE *p;
5179
5180 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005181 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 if (n > maxcount)
5183 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005184 if (n == 0)
5185 goto nothing;
5186 /* new_size = self->length + n * (str2->length - str1->length)); */
5187 delta = (str2->length - str1->length);
5188 if (delta == 0) {
5189 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005191 product = n * (str2->length - str1->length);
5192 if ((product / (str2->length - str1->length)) != n) {
5193 PyErr_SetString(PyExc_OverflowError,
5194 "replace string is too long");
5195 return NULL;
5196 }
5197 new_size = self->length + product;
5198 if (new_size < 0) {
5199 PyErr_SetString(PyExc_OverflowError,
5200 "replace string is too long");
5201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 }
5203 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005204 u = _PyUnicode_New(new_size);
5205 if (!u)
5206 return NULL;
5207 i = 0;
5208 p = u->str;
5209 e = self->length - str1->length;
5210 if (str1->length > 0) {
5211 while (n-- > 0) {
5212 /* look for next match */
5213 j = i;
5214 while (j <= e) {
5215 if (Py_UNICODE_MATCH(self, j, str1))
5216 break;
5217 j++;
5218 }
5219 if (j > i) {
5220 if (j > e)
5221 break;
5222 /* copy unchanged part [i:j] */
5223 Py_UNICODE_COPY(p, self->str+i, j-i);
5224 p += j - i;
5225 }
5226 /* copy substitution string */
5227 if (str2->length > 0) {
5228 Py_UNICODE_COPY(p, str2->str, str2->length);
5229 p += str2->length;
5230 }
5231 i = j + str1->length;
5232 }
5233 if (i < self->length)
5234 /* copy tail [i:] */
5235 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5236 } else {
5237 /* interleave */
5238 while (n > 0) {
5239 Py_UNICODE_COPY(p, str2->str, str2->length);
5240 p += str2->length;
5241 if (--n <= 0)
5242 break;
5243 *p++ = self->str[i++];
5244 }
5245 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005249
5250nothing:
5251 /* nothing to replace; return original string (when possible) */
5252 if (PyUnicode_CheckExact(self)) {
5253 Py_INCREF(self);
5254 return (PyObject *) self;
5255 }
5256 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257}
5258
5259/* --- Unicode Object Methods --------------------------------------------- */
5260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262"S.title() -> unicode\n\
5263\n\
5264Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005265characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005268unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 return fixup(self, fixtitle);
5271}
5272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274"S.capitalize() -> unicode\n\
5275\n\
5276Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005277have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005280unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 return fixup(self, fixcapitalize);
5283}
5284
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, replaces every
   word in the list by its fixcapitalize'd version, and joins the words
   again using PyUnicode_Join() with a NULL separator.  Returns NULL if
   any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5322
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005323/* Argument converter. Coerces to a single unicode character */
5324
5325static int
5326convert_uc(PyObject *obj, void *addr)
5327{
5328 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5329 PyObject *uniobj;
5330 Py_UNICODE *unistr;
5331
5332 uniobj = PyUnicode_FromObject(obj);
5333 if (uniobj == NULL) {
5334 PyErr_SetString(PyExc_TypeError,
5335 "The fill character cannot be converted to Unicode");
5336 return 0;
5337 }
5338 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5339 PyErr_SetString(PyExc_TypeError,
5340 "The fill character must be exactly one character long");
5341 Py_DECREF(uniobj);
5342 return 0;
5343 }
5344 unistr = PyUnicode_AS_UNICODE(uniobj);
5345 *fillcharloc = unistr[0];
5346 Py_DECREF(uniobj);
5347 return 1;
5348}
5349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005350PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005351"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005353Return S centered in a Unicode string of length width. Padding is\n\
5354done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
5356static PyObject *
5357unicode_center(PyUnicodeObject *self, PyObject *args)
5358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005359 Py_ssize_t marg, left;
5360 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005361 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
Thomas Woutersde017742006-02-16 19:34:37 +00005363 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 return NULL;
5365
Tim Peters7a29bd52001-09-12 03:03:31 +00005366 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 Py_INCREF(self);
5368 return (PyObject*) self;
5369 }
5370
5371 marg = width - self->length;
5372 left = marg / 2 + (marg & width & 1);
5373
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005374 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375}
5376
Marc-André Lemburge5034372000-08-08 08:04:29 +00005377#if 0
5378
5379/* This code should go into some future Unicode collation support
5380 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005381 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005383/* speedy UTF-16 code point order comparison */
5384/* gleaned from: */
5385/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5386
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005387static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005388{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005389 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005390 0, 0, 0, 0, 0, 0, 0, 0,
5391 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005392 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005393};
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395static int
5396unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5397{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 Py_UNICODE *s1 = str1->str;
5401 Py_UNICODE *s2 = str2->str;
5402
5403 len1 = str1->length;
5404 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005407 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005408
5409 c1 = *s1++;
5410 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005411
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005412 if (c1 > (1<<11) * 26)
5413 c1 += utf16Fixup[c1>>11];
5414 if (c2 > (1<<11) * 26)
5415 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005416 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005417
5418 if (c1 != c2)
5419 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005420
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005421 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
5423
5424 return (len1 < len2) ? -1 : (len1 != len2);
5425}
5426
Marc-André Lemburge5034372000-08-08 08:04:29 +00005427#else
5428
5429static int
5430unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005432 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005433
5434 Py_UNICODE *s1 = str1->str;
5435 Py_UNICODE *s2 = str2->str;
5436
5437 len1 = str1->length;
5438 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005439
Marc-André Lemburge5034372000-08-08 08:04:29 +00005440 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005441 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005442
Fredrik Lundh45714e92001-06-26 16:39:36 +00005443 c1 = *s1++;
5444 c2 = *s2++;
5445
5446 if (c1 != c2)
5447 return (c1 < c2) ? -1 : 1;
5448
Marc-André Lemburge5034372000-08-08 08:04:29 +00005449 len1--; len2--;
5450 }
5451
5452 return (len1 < len2) ? -1 : (len1 != len2);
5453}
5454
5455#endif
5456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457int PyUnicode_Compare(PyObject *left,
5458 PyObject *right)
5459{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005460 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5461 return unicode_compare((PyUnicodeObject *)left,
5462 (PyUnicodeObject *)right);
5463 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5464 (PyUnicode_Check(left) && PyString_Check(right))) {
5465 if (PyUnicode_Check(left))
5466 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5467 if (PyUnicode_Check(right))
5468 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5469 assert(PyString_Check(left));
5470 assert(PyString_Check(right));
5471 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005473 PyErr_Format(PyExc_TypeError,
5474 "Can't compare %.100s and %.100s",
5475 left->ob_type->tp_name,
5476 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 return -1;
5478}
5479
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005480PyObject *PyUnicode_RichCompare(PyObject *left,
5481 PyObject *right,
5482 int op)
5483{
5484 int result;
5485
5486 result = PyUnicode_Compare(left, right);
5487 if (result == -1 && PyErr_Occurred())
5488 goto onError;
5489
5490 /* Convert the return value to a Boolean */
5491 switch (op) {
5492 case Py_EQ:
5493 result = (result == 0);
5494 break;
5495 case Py_NE:
5496 result = (result != 0);
5497 break;
5498 case Py_LE:
5499 result = (result <= 0);
5500 break;
5501 case Py_GE:
5502 result = (result >= 0);
5503 break;
5504 case Py_LT:
5505 result = (result == -1);
5506 break;
5507 case Py_GT:
5508 result = (result == 1);
5509 break;
5510 }
5511 return PyBool_FromLong(result);
5512
5513 onError:
5514
5515 /* Standard case
5516
5517 Type errors mean that PyUnicode_FromObject() could not convert
5518 one of the arguments (usually the right hand side) to Unicode,
5519 ie. we can't handle the comparison request. However, it is
5520 possible that the other object knows a comparison method, which
5521 is why we return Py_NotImplemented to give the other object a
5522 chance.
5523
5524 */
5525 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5526 PyErr_Clear();
5527 Py_INCREF(Py_NotImplemented);
5528 return Py_NotImplemented;
5529 }
5530 if (op != Py_EQ && op != Py_NE)
5531 return NULL;
5532
5533 /* Equality comparison.
5534
5535 This is a special case: we silence any PyExc_UnicodeDecodeError
5536 and instead turn it into a PyErr_UnicodeWarning.
5537
5538 */
5539 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5540 return NULL;
5541 PyErr_Clear();
5542 if (PyErr_Warn(PyExc_UnicodeWarning,
5543 (op == Py_EQ) ?
5544 "Unicode equal comparison "
5545 "failed to convert both arguments to Unicode - "
5546 "interpreting them as being unequal" :
5547 "Unicode unequal comparison "
5548 "failed to convert both arguments to Unicode - "
5549 "interpreting them as being unequal"
5550 ) < 0)
5551 return NULL;
5552 result = (op == Py_NE);
5553 return PyBool_FromLong(result);
5554}
5555
Guido van Rossum403d68b2000-03-13 15:55:09 +00005556int PyUnicode_Contains(PyObject *container,
5557 PyObject *element)
5558{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005561
5562 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005563 sub = PyUnicode_FromObject(element);
5564 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005565 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005566 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005567 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005568 }
5569
Thomas Wouters477c8d52006-05-27 19:21:47 +00005570 str = PyUnicode_FromObject(container);
5571 if (!str) {
5572 Py_DECREF(sub);
5573 return -1;
5574 }
5575
5576 result = stringlib_contains_obj(str, sub);
5577
5578 Py_DECREF(str);
5579 Py_DECREF(sub);
5580
Guido van Rossum403d68b2000-03-13 15:55:09 +00005581 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005582}
5583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584/* Concat to string or Unicode object giving a new Unicode object. */
5585
5586PyObject *PyUnicode_Concat(PyObject *left,
5587 PyObject *right)
5588{
5589 PyUnicodeObject *u = NULL, *v = NULL, *w;
5590
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005591 if (PyBytes_Check(left) || PyBytes_Check(right))
5592 return PyBytes_Concat(left, right);
5593
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 /* Coerce the two arguments */
5595 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5596 if (u == NULL)
5597 goto onError;
5598 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5599 if (v == NULL)
5600 goto onError;
5601
5602 /* Shortcuts */
5603 if (v == unicode_empty) {
5604 Py_DECREF(v);
5605 return (PyObject *)u;
5606 }
5607 if (u == unicode_empty) {
5608 Py_DECREF(u);
5609 return (PyObject *)v;
5610 }
5611
5612 /* Concat the two Unicode strings */
5613 w = _PyUnicode_New(u->length + v->length);
5614 if (w == NULL)
5615 goto onError;
5616 Py_UNICODE_COPY(w->str, u->str, u->length);
5617 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5618
5619 Py_DECREF(u);
5620 Py_DECREF(v);
5621 return (PyObject *)w;
5622
5623onError:
5624 Py_XDECREF(u);
5625 Py_XDECREF(v);
5626 return NULL;
5627}
5628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005629PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630"S.count(sub[, start[, end]]) -> int\n\
5631\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005632Return the number of non-overlapping occurrences of substring sub in\n\
5633Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005634interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
5636static PyObject *
5637unicode_count(PyUnicodeObject *self, PyObject *args)
5638{
5639 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005641 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 PyObject *result;
5643
Guido van Rossumb8872e62000-05-09 14:14:27 +00005644 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5645 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 return NULL;
5647
5648 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005649 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 if (substring == NULL)
5651 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005652
Thomas Wouters477c8d52006-05-27 19:21:47 +00005653 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 result = PyInt_FromSsize_t(
5656 stringlib_count(self->str + start, end - start,
5657 substring->str, substring->length)
5658 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
5660 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 return result;
5663}
5664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005665PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005666"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005668Encodes S using the codec registered for encoding. encoding defaults\n\
5669to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005670handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5672'xmlcharrefreplace' as well as any other name registered with\n\
5673codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675static PyObject *
5676unicode_encode(PyUnicodeObject *self, PyObject *args)
5677{
5678 char *encoding = NULL;
5679 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005680 PyObject *v;
5681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5683 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005684 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005685 if (v == NULL)
5686 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005687 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005688 if (PyString_Check(v)) {
5689 /* Old codec, turn it into bytes */
5690 PyObject *b = PyBytes_FromObject(v);
5691 Py_DECREF(v);
5692 return b;
5693 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005694 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005695 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005696 "(type=%.400s)",
5697 v->ob_type->tp_name);
5698 Py_DECREF(v);
5699 return NULL;
5700 }
5701 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005702
5703 onError:
5704 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005705}
5706
5707PyDoc_STRVAR(decode__doc__,
5708"S.decode([encoding[,errors]]) -> string or unicode\n\
5709\n\
5710Decodes S using the codec registered for encoding. encoding defaults\n\
5711to the default encoding. errors may be given to set a different error\n\
5712handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5713a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5714as well as any other name registerd with codecs.register_error that is\n\
5715able to handle UnicodeDecodeErrors.");
5716
5717static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005718unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005719{
5720 char *encoding = NULL;
5721 char *errors = NULL;
5722 PyObject *v;
5723
5724 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5725 return NULL;
5726 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005727 if (v == NULL)
5728 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005729 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5730 PyErr_Format(PyExc_TypeError,
5731 "decoder did not return a string/unicode object "
5732 "(type=%.400s)",
5733 v->ob_type->tp_name);
5734 Py_DECREF(v);
5735 return NULL;
5736 }
5737 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005738
5739 onError:
5740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741}
5742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744"S.expandtabs([tabsize]) -> unicode\n\
5745\n\
5746Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005747If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749static PyObject*
5750unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5751{
5752 Py_UNICODE *e;
5753 Py_UNICODE *p;
5754 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 PyUnicodeObject *u;
5757 int tabsize = 8;
5758
5759 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5760 return NULL;
5761
Thomas Wouters7e474022000-07-16 12:04:32 +00005762 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 i = j = 0;
5764 e = self->str + self->length;
5765 for (p = self->str; p < e; p++)
5766 if (*p == '\t') {
5767 if (tabsize > 0)
5768 j += tabsize - (j % tabsize);
5769 }
5770 else {
5771 j++;
5772 if (*p == '\n' || *p == '\r') {
5773 i += j;
5774 j = 0;
5775 }
5776 }
5777
5778 /* Second pass: create output string and fill it */
5779 u = _PyUnicode_New(i + j);
5780 if (!u)
5781 return NULL;
5782
5783 j = 0;
5784 q = u->str;
5785
5786 for (p = self->str; p < e; p++)
5787 if (*p == '\t') {
5788 if (tabsize > 0) {
5789 i = tabsize - (j % tabsize);
5790 j += i;
5791 while (i--)
5792 *q++ = ' ';
5793 }
5794 }
5795 else {
5796 j++;
5797 *q++ = *p;
5798 if (*p == '\n' || *p == '\r')
5799 j = 0;
5800 }
5801
5802 return (PyObject*) u;
5803}
5804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005805PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806"S.find(sub [,start [,end]]) -> int\n\
5807\n\
5808Return the lowest index in S where substring sub is found,\n\
5809such that sub is contained within s[start,end]. Optional\n\
5810arguments start and end are interpreted as in slice notation.\n\
5811\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813
5814static PyObject *
5815unicode_find(PyUnicodeObject *self, PyObject *args)
5816{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005817 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005819 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005820 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Guido van Rossumb8872e62000-05-09 14:14:27 +00005822 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5823 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 substring = PyUnicode_FromObject(substring);
5826 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 return NULL;
5828
Thomas Wouters477c8d52006-05-27 19:21:47 +00005829 result = stringlib_find_slice(
5830 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5831 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5832 start, end
5833 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834
5835 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836
5837 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838}
5839
5840static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005841unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842{
5843 if (index < 0 || index >= self->length) {
5844 PyErr_SetString(PyExc_IndexError, "string index out of range");
5845 return NULL;
5846 }
5847
5848 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5849}
5850
5851static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005852unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005854 /* Since Unicode objects compare equal to their UTF-8 string
5855 counterparts, we hash the UTF-8 string. */
5856 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5857 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858}
5859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005860PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861"S.index(sub [,start [,end]]) -> int\n\
5862\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005863Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
5865static PyObject *
5866unicode_index(PyUnicodeObject *self, PyObject *args)
5867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005871 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
Guido van Rossumb8872e62000-05-09 14:14:27 +00005873 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5874 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876 substring = PyUnicode_FromObject(substring);
5877 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 return NULL;
5879
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 result = stringlib_find_slice(
5881 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5882 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5883 start, end
5884 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
5886 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 if (result < 0) {
5889 PyErr_SetString(PyExc_ValueError, "substring not found");
5890 return NULL;
5891 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005892
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894}
5895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005896PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005897"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005899Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005900at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901
5902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005903unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
5905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5906 register const Py_UNICODE *e;
5907 int cased;
5908
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 /* Shortcut for single character strings */
5910 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005913 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005914 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005915 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 e = p + PyUnicode_GET_SIZE(self);
5918 cased = 0;
5919 for (; p < e; p++) {
5920 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005923 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 else if (!cased && Py_UNICODE_ISLOWER(ch))
5925 cased = 1;
5926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005927 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928}
5929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005931"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005933Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
5936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005937unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
5939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5940 register const Py_UNICODE *e;
5941 int cased;
5942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 /* Shortcut for single character strings */
5944 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005947 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005948 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005949 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 e = p + PyUnicode_GET_SIZE(self);
5952 cased = 0;
5953 for (; p < e; p++) {
5954 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005955
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005957 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 else if (!cased && Py_UNICODE_ISUPPER(ch))
5959 cased = 1;
5960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962}
5963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005964PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005965"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005967Return True if S is a titlecased string and there is at least one\n\
5968character in S, i.e. upper- and titlecase characters may only\n\
5969follow uncased characters and lowercase characters only cased ones.\n\
5970Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
5972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005973unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
5975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5976 register const Py_UNICODE *e;
5977 int cased, previous_is_cased;
5978
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 /* Shortcut for single character strings */
5980 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005981 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5982 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005985 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005987
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 e = p + PyUnicode_GET_SIZE(self);
5989 cased = 0;
5990 previous_is_cased = 0;
5991 for (; p < e; p++) {
5992 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5995 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 previous_is_cased = 1;
5998 cased = 1;
5999 }
6000 else if (Py_UNICODE_ISLOWER(ch)) {
6001 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006002 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 previous_is_cased = 1;
6004 cased = 1;
6005 }
6006 else
6007 previous_is_cased = 0;
6008 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010}
6011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006012PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006013"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006015Return True if all characters in S are whitespace\n\
6016and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
6018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006019unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
6021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6022 register const Py_UNICODE *e;
6023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 /* Shortcut for single character strings */
6025 if (PyUnicode_GET_SIZE(self) == 1 &&
6026 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006027 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006029 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006030 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006031 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 e = p + PyUnicode_GET_SIZE(self);
6034 for (; p < e; p++) {
6035 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039}
6040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006042"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006043\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006044Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046
6047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006048unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049{
6050 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6051 register const Py_UNICODE *e;
6052
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006053 /* Shortcut for single character strings */
6054 if (PyUnicode_GET_SIZE(self) == 1 &&
6055 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006057
6058 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006059 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006060 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006061
6062 e = p + PyUnicode_GET_SIZE(self);
6063 for (; p < e; p++) {
6064 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006067 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068}
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006071"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006072\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006073Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006075
6076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006077unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006078{
6079 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6080 register const Py_UNICODE *e;
6081
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006082 /* Shortcut for single character strings */
6083 if (PyUnicode_GET_SIZE(self) == 1 &&
6084 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006085 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006086
6087 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006088 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006089 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006090
6091 e = p + PyUnicode_GET_SIZE(self);
6092 for (; p < e; p++) {
6093 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006095 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006097}
6098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006100"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
6105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006106unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
6108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6109 register const Py_UNICODE *e;
6110
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 /* Shortcut for single character strings */
6112 if (PyUnicode_GET_SIZE(self) == 1 &&
6113 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006116 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006117 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006118 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 e = p + PyUnicode_GET_SIZE(self);
6121 for (; p < e; p++) {
6122 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126}
6127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006128PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006129"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006131Return True if all characters in S are digits\n\
6132and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
6134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006135unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136{
6137 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6138 register const Py_UNICODE *e;
6139
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 /* Shortcut for single character strings */
6141 if (PyUnicode_GET_SIZE(self) == 1 &&
6142 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006143 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006145 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006146 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 e = p + PyUnicode_GET_SIZE(self);
6150 for (; p < e; p++) {
6151 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006157PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006158"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006160Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006161False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
6163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006164unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
6166 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6167 register const Py_UNICODE *e;
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 /* Shortcut for single character strings */
6170 if (PyUnicode_GET_SIZE(self) == 1 &&
6171 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006172 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006174 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006175 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006176 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006177
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 e = p + PyUnicode_GET_SIZE(self);
6179 for (; p < e; p++) {
6180 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006181 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006183 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187"S.join(sequence) -> unicode\n\
6188\n\
6189Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006190sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
6192static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006193unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006195 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
Martin v. Löwis18e16552006-02-15 17:27:45 +00006198static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199unicode_length(PyUnicodeObject *self)
6200{
6201 return self->length;
6202}
6203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006205"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206\n\
6207Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006208done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
6210static PyObject *
6211unicode_ljust(PyUnicodeObject *self, PyObject *args)
6212{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006213 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006214 Py_UNICODE fillchar = ' ';
6215
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006216 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 return NULL;
6218
Tim Peters7a29bd52001-09-12 03:03:31 +00006219 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 Py_INCREF(self);
6221 return (PyObject*) self;
6222 }
6223
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006224 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225}
6226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006227PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228"S.lower() -> unicode\n\
6229\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006230Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231
6232static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006233unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 return fixup(self, fixlower);
6236}
6237
/* Selectors for the strip helpers; each value is also an index into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6246
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006247/* externally visible for str.strip(unicode) */
6248PyObject *
6249_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6250{
6251 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006252 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006253 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006254 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6255 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006256
Thomas Wouters477c8d52006-05-27 19:21:47 +00006257 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6258
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006259 i = 0;
6260 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006261 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6262 i++;
6263 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006264 }
6265
6266 j = len;
6267 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006268 do {
6269 j--;
6270 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6271 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006272 }
6273
6274 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006275 Py_INCREF(self);
6276 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006277 }
6278 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006279 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280}
6281
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006284do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006286 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006287 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006288
6289 i = 0;
6290 if (striptype != RIGHTSTRIP) {
6291 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6292 i++;
6293 }
6294 }
6295
6296 j = len;
6297 if (striptype != LEFTSTRIP) {
6298 do {
6299 j--;
6300 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6301 j++;
6302 }
6303
6304 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6305 Py_INCREF(self);
6306 return (PyObject*)self;
6307 }
6308 else
6309 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310}
6311
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006312
6313static PyObject *
6314do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6315{
6316 PyObject *sep = NULL;
6317
6318 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6319 return NULL;
6320
6321 if (sep != NULL && sep != Py_None) {
6322 if (PyUnicode_Check(sep))
6323 return _PyUnicode_XStrip(self, striptype, sep);
6324 else if (PyString_Check(sep)) {
6325 PyObject *res;
6326 sep = PyUnicode_FromObject(sep);
6327 if (sep==NULL)
6328 return NULL;
6329 res = _PyUnicode_XStrip(self, striptype, sep);
6330 Py_DECREF(sep);
6331 return res;
6332 }
6333 else {
6334 PyErr_Format(PyExc_TypeError,
6335 "%s arg must be None, unicode or str",
6336 STRIPNAME(striptype));
6337 return NULL;
6338 }
6339 }
6340
6341 return do_strip(self, striptype);
6342}
6343
6344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006345PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006346"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006347\n\
6348Return a copy of the string S with leading and trailing\n\
6349whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006350If chars is given and not None, remove characters in chars instead.\n\
6351If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006352
6353static PyObject *
6354unicode_strip(PyUnicodeObject *self, PyObject *args)
6355{
6356 if (PyTuple_GET_SIZE(args) == 0)
6357 return do_strip(self, BOTHSTRIP); /* Common case */
6358 else
6359 return do_argstrip(self, BOTHSTRIP, args);
6360}
6361
6362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006363PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006364"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006365\n\
6366Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006367If chars is given and not None, remove characters in chars instead.\n\
6368If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006369
6370static PyObject *
6371unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6372{
6373 if (PyTuple_GET_SIZE(args) == 0)
6374 return do_strip(self, LEFTSTRIP); /* Common case */
6375 else
6376 return do_argstrip(self, LEFTSTRIP, args);
6377}
6378
6379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006380PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006381"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006382\n\
6383Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006384If chars is given and not None, remove characters in chars instead.\n\
6385If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006386
6387static PyObject *
6388unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6389{
6390 if (PyTuple_GET_SIZE(args) == 0)
6391 return do_strip(self, RIGHTSTRIP); /* Common case */
6392 else
6393 return do_argstrip(self, RIGHTSTRIP, args);
6394}
6395
6396
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399{
6400 PyUnicodeObject *u;
6401 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006403 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
6405 if (len < 0)
6406 len = 0;
6407
Tim Peters7a29bd52001-09-12 03:03:31 +00006408 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 /* no repeat, return original string */
6410 Py_INCREF(str);
6411 return (PyObject*) str;
6412 }
Tim Peters8f422462000-09-09 06:13:41 +00006413
6414 /* ensure # of chars needed doesn't overflow int and # of bytes
6415 * needed doesn't overflow size_t
6416 */
6417 nchars = len * str->length;
6418 if (len && nchars / len != str->length) {
6419 PyErr_SetString(PyExc_OverflowError,
6420 "repeated string is too long");
6421 return NULL;
6422 }
6423 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6424 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6425 PyErr_SetString(PyExc_OverflowError,
6426 "repeated string is too long");
6427 return NULL;
6428 }
6429 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 if (!u)
6431 return NULL;
6432
6433 p = u->str;
6434
Thomas Wouters477c8d52006-05-27 19:21:47 +00006435 if (str->length == 1 && len > 0) {
6436 Py_UNICODE_FILL(p, str->str[0], len);
6437 } else {
6438 Py_ssize_t done = 0; /* number of characters copied this far */
6439 if (done < nchars) {
6440 Py_UNICODE_COPY(p, str->str, str->length);
6441 done = str->length;
6442 }
6443 while (done < nchars) {
6444 int n = (done <= nchars-done) ? done : nchars-done;
6445 Py_UNICODE_COPY(p+done, p, n);
6446 done += n;
6447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 }
6449
6450 return (PyObject*) u;
6451}
6452
6453PyObject *PyUnicode_Replace(PyObject *obj,
6454 PyObject *subobj,
6455 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
6458 PyObject *self;
6459 PyObject *str1;
6460 PyObject *str2;
6461 PyObject *result;
6462
6463 self = PyUnicode_FromObject(obj);
6464 if (self == NULL)
6465 return NULL;
6466 str1 = PyUnicode_FromObject(subobj);
6467 if (str1 == NULL) {
6468 Py_DECREF(self);
6469 return NULL;
6470 }
6471 str2 = PyUnicode_FromObject(replobj);
6472 if (str2 == NULL) {
6473 Py_DECREF(self);
6474 Py_DECREF(str1);
6475 return NULL;
6476 }
Tim Petersced69f82003-09-16 20:30:58 +00006477 result = replace((PyUnicodeObject *)self,
6478 (PyUnicodeObject *)str1,
6479 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 maxcount);
6481 Py_DECREF(self);
6482 Py_DECREF(str1);
6483 Py_DECREF(str2);
6484 return result;
6485}
6486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006487PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488"S.replace (old, new[, maxsplit]) -> unicode\n\
6489\n\
6490Return a copy of S with all occurrences of substring\n\
6491old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006492given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
6494static PyObject*
6495unicode_replace(PyUnicodeObject *self, PyObject *args)
6496{
6497 PyUnicodeObject *str1;
6498 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006499 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 PyObject *result;
6501
Martin v. Löwis18e16552006-02-15 17:27:45 +00006502 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 return NULL;
6504 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6505 if (str1 == NULL)
6506 return NULL;
6507 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006508 if (str2 == NULL) {
6509 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
6513 result = replace(self, str1, str2, maxcount);
6514
6515 Py_DECREF(str1);
6516 Py_DECREF(str2);
6517 return result;
6518}
6519
6520static
6521PyObject *unicode_repr(PyObject *unicode)
6522{
6523 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6524 PyUnicode_GET_SIZE(unicode),
6525 1);
6526}
6527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529"S.rfind(sub [,start [,end]]) -> int\n\
6530\n\
6531Return the highest index in S where substring sub is found,\n\
6532such that sub is contained within s[start,end]. Optional\n\
6533arguments start and end are interpreted as in slice notation.\n\
6534\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006535Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
6537static PyObject *
6538unicode_rfind(PyUnicodeObject *self, PyObject *args)
6539{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006541 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006542 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
Guido van Rossumb8872e62000-05-09 14:14:27 +00006545 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6546 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006548 substring = PyUnicode_FromObject(substring);
6549 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return NULL;
6551
Thomas Wouters477c8d52006-05-27 19:21:47 +00006552 result = stringlib_rfind_slice(
6553 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6554 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6555 start, end
6556 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006559
6560 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561}
6562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006563PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564"S.rindex(sub [,start [,end]]) -> int\n\
6565\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568static PyObject *
6569unicode_rindex(PyUnicodeObject *self, PyObject *args)
6570{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006571 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006573 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006574 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
Guido van Rossumb8872e62000-05-09 14:14:27 +00006576 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 substring = PyUnicode_FromObject(substring);
6580 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 return NULL;
6582
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583 result = stringlib_rfind_slice(
6584 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6585 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6586 start, end
6587 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 if (result < 0) {
6592 PyErr_SetString(PyExc_ValueError, "substring not found");
6593 return NULL;
6594 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006595 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596}
6597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006598PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006599"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600\n\
6601Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006602done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604static PyObject *
6605unicode_rjust(PyUnicodeObject *self, PyObject *args)
6606{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006607 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006608 Py_UNICODE fillchar = ' ';
6609
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006610 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 return NULL;
6612
Tim Peters7a29bd52001-09-12 03:03:31 +00006613 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 Py_INCREF(self);
6615 return (PyObject*) self;
6616 }
6617
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006618 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
6624 /* standard clamping */
6625 if (start < 0)
6626 start = 0;
6627 if (end < 0)
6628 end = 0;
6629 if (end > self->length)
6630 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006631 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 /* full slice, return original string */
6633 Py_INCREF(self);
6634 return (PyObject*) self;
6635 }
6636 if (start > end)
6637 start = end;
6638 /* copy slice */
6639 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6640 end - start);
6641}
6642
6643PyObject *PyUnicode_Split(PyObject *s,
6644 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
6647 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 s = PyUnicode_FromObject(s);
6650 if (s == NULL)
6651 return NULL;
6652 if (sep != NULL) {
6653 sep = PyUnicode_FromObject(sep);
6654 if (sep == NULL) {
6655 Py_DECREF(s);
6656 return NULL;
6657 }
6658 }
6659
6660 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6661
6662 Py_DECREF(s);
6663 Py_XDECREF(sep);
6664 return result;
6665}
6666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668"S.split([sep [,maxsplit]]) -> list of strings\n\
6669\n\
6670Return a list of the words in S, using sep as the\n\
6671delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006672splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006673any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675static PyObject*
6676unicode_split(PyUnicodeObject *self, PyObject *args)
6677{
6678 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Martin v. Löwis18e16552006-02-15 17:27:45 +00006681 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 return NULL;
6683
6684 if (substring == Py_None)
6685 return split(self, NULL, maxcount);
6686 else if (PyUnicode_Check(substring))
6687 return split(self, (PyUnicodeObject *)substring, maxcount);
6688 else
6689 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6690}
6691
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692PyObject *
6693PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6694{
6695 PyObject* str_obj;
6696 PyObject* sep_obj;
6697 PyObject* out;
6698
6699 str_obj = PyUnicode_FromObject(str_in);
6700 if (!str_obj)
6701 return NULL;
6702 sep_obj = PyUnicode_FromObject(sep_in);
6703 if (!sep_obj) {
6704 Py_DECREF(str_obj);
6705 return NULL;
6706 }
6707
6708 out = stringlib_partition(
6709 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6710 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6711 );
6712
6713 Py_DECREF(sep_obj);
6714 Py_DECREF(str_obj);
6715
6716 return out;
6717}
6718
6719
6720PyObject *
6721PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6722{
6723 PyObject* str_obj;
6724 PyObject* sep_obj;
6725 PyObject* out;
6726
6727 str_obj = PyUnicode_FromObject(str_in);
6728 if (!str_obj)
6729 return NULL;
6730 sep_obj = PyUnicode_FromObject(sep_in);
6731 if (!sep_obj) {
6732 Py_DECREF(str_obj);
6733 return NULL;
6734 }
6735
6736 out = stringlib_rpartition(
6737 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6738 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6739 );
6740
6741 Py_DECREF(sep_obj);
6742 Py_DECREF(str_obj);
6743
6744 return out;
6745}
6746
6747PyDoc_STRVAR(partition__doc__,
6748"S.partition(sep) -> (head, sep, tail)\n\
6749\n\
6750Searches for the separator sep in S, and returns the part before it,\n\
6751the separator itself, and the part after it. If the separator is not\n\
6752found, returns S and two empty strings.");
6753
6754static PyObject*
6755unicode_partition(PyUnicodeObject *self, PyObject *separator)
6756{
6757 return PyUnicode_Partition((PyObject *)self, separator);
6758}
6759
6760PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006761"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006762\n\
6763Searches for the separator sep in S, starting at the end of S, and returns\n\
6764the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006765separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006766
6767static PyObject*
6768unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6769{
6770 return PyUnicode_RPartition((PyObject *)self, separator);
6771}
6772
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006773PyObject *PyUnicode_RSplit(PyObject *s,
6774 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006775 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006776{
6777 PyObject *result;
6778
6779 s = PyUnicode_FromObject(s);
6780 if (s == NULL)
6781 return NULL;
6782 if (sep != NULL) {
6783 sep = PyUnicode_FromObject(sep);
6784 if (sep == NULL) {
6785 Py_DECREF(s);
6786 return NULL;
6787 }
6788 }
6789
6790 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6791
6792 Py_DECREF(s);
6793 Py_XDECREF(sep);
6794 return result;
6795}
6796
6797PyDoc_STRVAR(rsplit__doc__,
6798"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6799\n\
6800Return a list of the words in S, using sep as the\n\
6801delimiter string, starting at the end of the string and\n\
6802working to the front. If maxsplit is given, at most maxsplit\n\
6803splits are done. If sep is not specified, any whitespace string\n\
6804is a separator.");
6805
6806static PyObject*
6807unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6808{
6809 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006810 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006811
Martin v. Löwis18e16552006-02-15 17:27:45 +00006812 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006813 return NULL;
6814
6815 if (substring == Py_None)
6816 return rsplit(self, NULL, maxcount);
6817 else if (PyUnicode_Check(substring))
6818 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6819 else
6820 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6821}
6822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006824"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825\n\
6826Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006827Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829
6830static PyObject*
6831unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6832{
Guido van Rossum86662912000-04-11 15:38:46 +00006833 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834
Guido van Rossum86662912000-04-11 15:38:46 +00006835 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 return NULL;
6837
Guido van Rossum86662912000-04-11 15:38:46 +00006838 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839}
6840
6841static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006842PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006844 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6845 Py_XINCREF(res);
6846 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847}
6848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006849PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850"S.swapcase() -> unicode\n\
6851\n\
6852Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006853and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
6855static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006856unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return fixup(self, fixswapcase);
6859}
6860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862"S.translate(table) -> unicode\n\
6863\n\
6864Return a copy of the string S, where all characters have been mapped\n\
6865through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006866Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6867Unmapped characters are left untouched. Characters mapped to None\n\
6868are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006871unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
Tim Petersced69f82003-09-16 20:30:58 +00006873 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006875 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 "ignore");
6877}
6878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880"S.upper() -> unicode\n\
6881\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006882Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
6884static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006885unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 return fixup(self, fixupper);
6888}
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891"S.zfill(width) -> unicode\n\
6892\n\
6893Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
6896static PyObject *
6897unicode_zfill(PyUnicodeObject *self, PyObject *args)
6898{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 PyUnicodeObject *u;
6901
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t width;
6903 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 return NULL;
6905
6906 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006907 if (PyUnicode_CheckExact(self)) {
6908 Py_INCREF(self);
6909 return (PyObject*) self;
6910 }
6911 else
6912 return PyUnicode_FromUnicode(
6913 PyUnicode_AS_UNICODE(self),
6914 PyUnicode_GET_SIZE(self)
6915 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 }
6917
6918 fill = width - self->length;
6919
6920 u = pad(self, fill, 0, '0');
6921
Walter Dörwald068325e2002-04-15 13:36:47 +00006922 if (u == NULL)
6923 return NULL;
6924
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 if (u->str[fill] == '+' || u->str[fill] == '-') {
6926 /* move sign to beginning of string */
6927 u->str[0] = u->str[fill];
6928 u->str[fill] = '0';
6929 }
6930
6931 return (PyObject*) u;
6932}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
#if 0
/* Compiled out: reports the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006945Return True if S starts with the specified prefix, False otherwise.\n\
6946With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947With optional end, stop comparing S at that position.\n\
6948prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject *
6951unicode_startswith(PyUnicodeObject *self,
6952 PyObject *args)
6953{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006956 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006957 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006961 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963 if (PyTuple_Check(subobj)) {
6964 Py_ssize_t i;
6965 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6966 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6967 PyTuple_GET_ITEM(subobj, i));
6968 if (substring == NULL)
6969 return NULL;
6970 result = tailmatch(self, substring, start, end, -1);
6971 Py_DECREF(substring);
6972 if (result) {
6973 Py_RETURN_TRUE;
6974 }
6975 }
6976 /* nothing matched */
6977 Py_RETURN_FALSE;
6978 }
6979 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981 return NULL;
6982 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985}
6986
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006991Return True if S ends with the specified suffix, False otherwise.\n\
6992With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993With optional end, stop comparing S at that position.\n\
6994suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995
6996static PyObject *
6997unicode_endswith(PyUnicodeObject *self,
6998 PyObject *args)
6999{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007002 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007003 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007004 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007006 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7007 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007009 if (PyTuple_Check(subobj)) {
7010 Py_ssize_t i;
7011 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7012 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7013 PyTuple_GET_ITEM(subobj, i));
7014 if (substring == NULL)
7015 return NULL;
7016 result = tailmatch(self, substring, start, end, +1);
7017 Py_DECREF(substring);
7018 if (result) {
7019 Py_RETURN_TRUE;
7020 }
7021 }
7022 Py_RETURN_FALSE;
7023 }
7024 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
7033
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007034
7035static PyObject *
7036unicode_getnewargs(PyUnicodeObject *v)
7037{
7038 return Py_BuildValue("(u#)", v->str, v->length);
7039}
7040
7041
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042static PyMethodDef unicode_methods[] = {
7043
7044 /* Order is according to common usage: often used methods should
7045 appear first, since lookup is done sequentially. */
7046
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7048 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7049 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007050 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007051 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7052 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7053 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7054 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7055 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7056 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7057 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007058 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7060 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7061 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007063 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7065 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7066 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7067 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007069 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007070 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007072 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7073 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7074 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7075 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7076 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7077 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7078 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7079 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7080 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7081 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7082 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7083 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7084 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7085 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007086 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007087#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007088 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089#endif
7090
7091#if 0
7092 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007093 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094#endif
7095
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007096 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 {NULL, NULL}
7098};
7099
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007100static PyObject *
7101unicode_mod(PyObject *v, PyObject *w)
7102{
7103 if (!PyUnicode_Check(v)) {
7104 Py_INCREF(Py_NotImplemented);
7105 return Py_NotImplemented;
7106 }
7107 return PyUnicode_Format(v, w);
7108}
7109
7110static PyNumberMethods unicode_as_number = {
7111 0, /*nb_add*/
7112 0, /*nb_subtract*/
7113 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007114 unicode_mod, /*nb_remainder*/
7115};
7116
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007119 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007120 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7121 (ssizeargfunc) unicode_getitem, /* sq_item */
7122 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 0, /* sq_ass_item */
7124 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007125 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126};
7127
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007128static PyObject*
7129unicode_subscript(PyUnicodeObject* self, PyObject* item)
7130{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007131 if (PyIndex_Check(item)) {
7132 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007133 if (i == -1 && PyErr_Occurred())
7134 return NULL;
7135 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007136 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007137 return unicode_getitem(self, i);
7138 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007140 Py_UNICODE* source_buf;
7141 Py_UNICODE* result_buf;
7142 PyObject* result;
7143
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007144 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007145 &start, &stop, &step, &slicelength) < 0) {
7146 return NULL;
7147 }
7148
7149 if (slicelength <= 0) {
7150 return PyUnicode_FromUnicode(NULL, 0);
7151 } else {
7152 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007153 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7154 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007155
7156 if (result_buf == NULL)
7157 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007158
7159 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7160 result_buf[i] = source_buf[cur];
7161 }
Tim Petersced69f82003-09-16 20:30:58 +00007162
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007163 result = PyUnicode_FromUnicode(result_buf, slicelength);
7164 PyMem_FREE(result_buf);
7165 return result;
7166 }
7167 } else {
7168 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7169 return NULL;
7170 }
7171}
7172
7173static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007174 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007175 (binaryfunc)unicode_subscript, /* mp_subscript */
7176 (objobjargproc)0, /* mp_ass_subscript */
7177};
7178
Martin v. Löwis18e16552006-02-15 17:27:45 +00007179static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007181 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 const void **ptr)
7183{
7184 if (index != 0) {
7185 PyErr_SetString(PyExc_SystemError,
7186 "accessing non-existent unicode segment");
7187 return -1;
7188 }
7189 *ptr = (void *) self->str;
7190 return PyUnicode_GET_DATA_SIZE(self);
7191}
7192
Martin v. Löwis18e16552006-02-15 17:27:45 +00007193static Py_ssize_t
7194unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 const void **ptr)
7196{
7197 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007198 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 return -1;
7200}
7201
7202static int
7203unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205{
7206 if (lenp)
7207 *lenp = PyUnicode_GET_DATA_SIZE(self);
7208 return 1;
7209}
7210
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007211static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007213 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 const void **ptr)
7215{
7216 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007217
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 if (index != 0) {
7219 PyErr_SetString(PyExc_SystemError,
7220 "accessing non-existent unicode segment");
7221 return -1;
7222 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007223 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 if (str == NULL)
7225 return -1;
7226 *ptr = (void *) PyString_AS_STRING(str);
7227 return PyString_GET_SIZE(str);
7228}
7229
7230/* Helpers for PyUnicode_Format() */
7231
7232static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007235 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 if (argidx < arglen) {
7237 (*p_argidx)++;
7238 if (arglen < 0)
7239 return args;
7240 else
7241 return PyTuple_GetItem(args, argidx);
7242 }
7243 PyErr_SetString(PyExc_TypeError,
7244 "not enough arguments for format string");
7245 return NULL;
7246}
7247
/* Bit flags recorded for the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7253
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007255strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 register Py_ssize_t i;
7258 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 for (i = len - 1; i >= 0; i--)
7260 buffer[i] = (Py_UNICODE) charbuffer[i];
7261
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 return len;
7263}
7264
Neal Norwitzfc76d632006-01-10 06:03:13 +00007265static int
7266doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7267{
Tim Peters15231542006-02-16 01:08:01 +00007268 Py_ssize_t result;
7269
Neal Norwitzfc76d632006-01-10 06:03:13 +00007270 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007271 result = strtounicode(buffer, (char *)buffer);
7272 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007273}
7274
7275static int
7276longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7277{
Tim Peters15231542006-02-16 01:08:01 +00007278 Py_ssize_t result;
7279
Neal Norwitzfc76d632006-01-10 06:03:13 +00007280 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007281 result = strtounicode(buffer, (char *)buffer);
7282 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007283}
7284
Guido van Rossum078151d2002-08-11 04:24:12 +00007285/* XXX To save some code duplication, formatfloat/long/int could have been
7286 shared with stringobject.c, converting from 8-bit to Unicode after the
7287 formatting is done. */
7288
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289static int
7290formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007291 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 int flags,
7293 int prec,
7294 int type,
7295 PyObject *v)
7296{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007297 /* fmt = '%#.' + `prec` + `type`
7298 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299 char fmt[20];
7300 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007301
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 x = PyFloat_AsDouble(v);
7303 if (x == -1.0 && PyErr_Occurred())
7304 return -1;
7305 if (prec < 0)
7306 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7308 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007309 /* Worst case length calc to ensure no buffer overrun:
7310
7311 'g' formats:
7312 fmt = %#.<prec>g
7313 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7314 for any double rep.)
7315 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7316
7317 'f' formats:
7318 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7319 len = 1 + 50 + 1 + prec = 52 + prec
7320
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007321 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007322 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007323
7324 */
7325 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7326 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007327 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007328 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007329 return -1;
7330 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007331 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7332 (flags&F_ALT) ? "#" : "",
7333 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007334 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335}
7336
Tim Peters38fd5b62000-09-21 05:43:11 +00007337static PyObject*
7338formatlong(PyObject *val, int flags, int prec, int type)
7339{
7340 char *buf;
7341 int i, len;
7342 PyObject *str; /* temporary string object. */
7343 PyUnicodeObject *result;
7344
7345 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7346 if (!str)
7347 return NULL;
7348 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007349 if (!result) {
7350 Py_DECREF(str);
7351 return NULL;
7352 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007353 for (i = 0; i < len; i++)
7354 result->str[i] = buf[i];
7355 result->str[len] = 0;
7356 Py_DECREF(str);
7357 return (PyObject*)result;
7358}
7359
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360static int
7361formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007362 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 int flags,
7364 int prec,
7365 int type,
7366 PyObject *v)
7367{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007368 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007369 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7370 * + 1 + 1
7371 * = 24
7372 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007373 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 long x;
7376
7377 x = PyInt_AsLong(v);
7378 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007379 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007380 if (x < 0 && type == 'u') {
7381 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007382 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007383 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7384 sign = "-";
7385 else
7386 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007388 prec = 1;
7389
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007390 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7391 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007392 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007393 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007394 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007395 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007396 return -1;
7397 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007398
7399 if ((flags & F_ALT) &&
7400 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007401 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007402 * of issues that cause pain:
7403 * - when 0 is being converted, the C standard leaves off
7404 * the '0x' or '0X', which is inconsistent with other
7405 * %#x/%#X conversions and inconsistent with Python's
7406 * hex() function
7407 * - there are platforms that violate the standard and
7408 * convert 0 with the '0x' or '0X'
7409 * (Metrowerks, Compaq Tru64)
7410 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007411 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007412 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007413 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007414 * We can achieve the desired consistency by inserting our
7415 * own '0x' or '0X' prefix, and substituting %x/%X in place
7416 * of %#x/%#X.
7417 *
7418 * Note that this is the same approach as used in
7419 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007420 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007421 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7422 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007423 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007424 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007425 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7426 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007427 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007428 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007429 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007430 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007431 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007432 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433}
7434
7435static int
7436formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007437 size_t buflen,
7438 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007440 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007441 if (PyUnicode_Check(v)) {
7442 if (PyUnicode_GET_SIZE(v) != 1)
7443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007447 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007448 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007449 goto onError;
7450 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453 else {
7454 /* Integer input truncated to a character */
7455 long x;
7456 x = PyInt_AsLong(v);
7457 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007458 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007459#ifdef Py_UNICODE_WIDE
7460 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007461 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007462 "%c arg not in range(0x110000) "
7463 "(wide Python build)");
7464 return -1;
7465 }
7466#else
7467 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007468 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007469 "%c arg not in range(0x10000) "
7470 "(narrow Python build)");
7471 return -1;
7472 }
7473#endif
7474 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
7476 buf[1] = '\0';
7477 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007478
7479 onError:
7480 PyErr_SetString(PyExc_TypeError,
7481 "%c requires int or char");
7482 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483}
7484
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007485/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7486
7487 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7488 chars are formatted. XXX This is a magic number. Each formatting
7489 routine does bounds checking to ensure no overflow, but a better
7490 solution may be to malloc a buffer of appropriate size for each
7491 format. For now, the current solution is sufficient.
7492*/
#define FORMATBUFLEN ((size_t)120)  /* parenthesized so the cast cannot bind to neighbouring tokens */
7494
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495PyObject *PyUnicode_Format(PyObject *format,
7496 PyObject *args)
7497{
7498 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 int args_owned = 0;
7501 PyUnicodeObject *result = NULL;
7502 PyObject *dict = NULL;
7503 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007504
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 if (format == NULL || args == NULL) {
7506 PyErr_BadInternalCall();
7507 return NULL;
7508 }
7509 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007510 if (uformat == NULL)
7511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 fmt = PyUnicode_AS_UNICODE(uformat);
7513 fmtcnt = PyUnicode_GET_SIZE(uformat);
7514
7515 reslen = rescnt = fmtcnt + 100;
7516 result = _PyUnicode_New(reslen);
7517 if (result == NULL)
7518 goto onError;
7519 res = PyUnicode_AS_UNICODE(result);
7520
7521 if (PyTuple_Check(args)) {
7522 arglen = PyTuple_Size(args);
7523 argidx = 0;
7524 }
7525 else {
7526 arglen = -1;
7527 argidx = -2;
7528 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007529 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7530 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 dict = args;
7532
7533 while (--fmtcnt >= 0) {
7534 if (*fmt != '%') {
7535 if (--rescnt < 0) {
7536 rescnt = fmtcnt + 100;
7537 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007538 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007539 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7541 --rescnt;
7542 }
7543 *res++ = *fmt++;
7544 }
7545 else {
7546 /* Got a format specifier */
7547 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 Py_UNICODE c = '\0';
7551 Py_UNICODE fill;
7552 PyObject *v = NULL;
7553 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007554 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007556 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007557 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
7559 fmt++;
7560 if (*fmt == '(') {
7561 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007562 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 PyObject *key;
7564 int pcount = 1;
7565
7566 if (dict == NULL) {
7567 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007568 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 goto onError;
7570 }
7571 ++fmt;
7572 --fmtcnt;
7573 keystart = fmt;
7574 /* Skip over balanced parentheses */
7575 while (pcount > 0 && --fmtcnt >= 0) {
7576 if (*fmt == ')')
7577 --pcount;
7578 else if (*fmt == '(')
7579 ++pcount;
7580 fmt++;
7581 }
7582 keylen = fmt - keystart - 1;
7583 if (fmtcnt < 0 || pcount > 0) {
7584 PyErr_SetString(PyExc_ValueError,
7585 "incomplete format key");
7586 goto onError;
7587 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007588#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007589 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 then looked up since Python uses strings to hold
7591 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007592 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 key = PyUnicode_EncodeUTF8(keystart,
7594 keylen,
7595 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007596#else
7597 key = PyUnicode_FromUnicode(keystart, keylen);
7598#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 if (key == NULL)
7600 goto onError;
7601 if (args_owned) {
7602 Py_DECREF(args);
7603 args_owned = 0;
7604 }
7605 args = PyObject_GetItem(dict, key);
7606 Py_DECREF(key);
7607 if (args == NULL) {
7608 goto onError;
7609 }
7610 args_owned = 1;
7611 arglen = -1;
7612 argidx = -2;
7613 }
7614 while (--fmtcnt >= 0) {
7615 switch (c = *fmt++) {
7616 case '-': flags |= F_LJUST; continue;
7617 case '+': flags |= F_SIGN; continue;
7618 case ' ': flags |= F_BLANK; continue;
7619 case '#': flags |= F_ALT; continue;
7620 case '0': flags |= F_ZERO; continue;
7621 }
7622 break;
7623 }
7624 if (c == '*') {
7625 v = getnextarg(args, arglen, &argidx);
7626 if (v == NULL)
7627 goto onError;
7628 if (!PyInt_Check(v)) {
7629 PyErr_SetString(PyExc_TypeError,
7630 "* wants int");
7631 goto onError;
7632 }
7633 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007634 if (width == -1 && PyErr_Occurred())
7635 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 if (width < 0) {
7637 flags |= F_LJUST;
7638 width = -width;
7639 }
7640 if (--fmtcnt >= 0)
7641 c = *fmt++;
7642 }
7643 else if (c >= '0' && c <= '9') {
7644 width = c - '0';
7645 while (--fmtcnt >= 0) {
7646 c = *fmt++;
7647 if (c < '0' || c > '9')
7648 break;
7649 if ((width*10) / 10 != width) {
7650 PyErr_SetString(PyExc_ValueError,
7651 "width too big");
7652 goto onError;
7653 }
7654 width = width*10 + (c - '0');
7655 }
7656 }
7657 if (c == '.') {
7658 prec = 0;
7659 if (--fmtcnt >= 0)
7660 c = *fmt++;
7661 if (c == '*') {
7662 v = getnextarg(args, arglen, &argidx);
7663 if (v == NULL)
7664 goto onError;
7665 if (!PyInt_Check(v)) {
7666 PyErr_SetString(PyExc_TypeError,
7667 "* wants int");
7668 goto onError;
7669 }
7670 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007671 if (prec == -1 && PyErr_Occurred())
7672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 if (prec < 0)
7674 prec = 0;
7675 if (--fmtcnt >= 0)
7676 c = *fmt++;
7677 }
7678 else if (c >= '0' && c <= '9') {
7679 prec = c - '0';
7680 while (--fmtcnt >= 0) {
7681 c = Py_CHARMASK(*fmt++);
7682 if (c < '0' || c > '9')
7683 break;
7684 if ((prec*10) / 10 != prec) {
7685 PyErr_SetString(PyExc_ValueError,
7686 "prec too big");
7687 goto onError;
7688 }
7689 prec = prec*10 + (c - '0');
7690 }
7691 }
7692 } /* prec */
7693 if (fmtcnt >= 0) {
7694 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 if (--fmtcnt >= 0)
7696 c = *fmt++;
7697 }
7698 }
7699 if (fmtcnt < 0) {
7700 PyErr_SetString(PyExc_ValueError,
7701 "incomplete format");
7702 goto onError;
7703 }
7704 if (c != '%') {
7705 v = getnextarg(args, arglen, &argidx);
7706 if (v == NULL)
7707 goto onError;
7708 }
7709 sign = 0;
7710 fill = ' ';
7711 switch (c) {
7712
7713 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007714 pbuf = formatbuf;
7715 /* presume that buffer length is at least 1 */
7716 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 len = 1;
7718 break;
7719
7720 case 's':
7721 case 'r':
7722 if (PyUnicode_Check(v) && c == 's') {
7723 temp = v;
7724 Py_INCREF(temp);
7725 }
7726 else {
7727 PyObject *unicode;
7728 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007729 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 else
7731 temp = PyObject_Repr(v);
7732 if (temp == NULL)
7733 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007734 if (PyUnicode_Check(temp))
7735 /* nothing to do */;
7736 else if (PyString_Check(temp)) {
7737 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007738 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007740 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007742 Py_DECREF(temp);
7743 temp = unicode;
7744 if (temp == NULL)
7745 goto onError;
7746 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007747 else {
7748 Py_DECREF(temp);
7749 PyErr_SetString(PyExc_TypeError,
7750 "%s argument has non-string str()");
7751 goto onError;
7752 }
7753 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007754 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 len = PyUnicode_GET_SIZE(temp);
7756 if (prec >= 0 && len > prec)
7757 len = prec;
7758 break;
7759
7760 case 'i':
7761 case 'd':
7762 case 'u':
7763 case 'o':
7764 case 'x':
7765 case 'X':
7766 if (c == 'i')
7767 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007768 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007769 temp = formatlong(v, flags, prec, c);
7770 if (!temp)
7771 goto onError;
7772 pbuf = PyUnicode_AS_UNICODE(temp);
7773 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007774 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007776 else {
7777 pbuf = formatbuf;
7778 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7779 flags, prec, c, v);
7780 if (len < 0)
7781 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007782 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007783 }
7784 if (flags & F_ZERO)
7785 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 break;
7787
7788 case 'e':
7789 case 'E':
7790 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007791 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 case 'g':
7793 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007794 if (c == 'F')
7795 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007796 pbuf = formatbuf;
7797 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7798 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 if (len < 0)
7800 goto onError;
7801 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007802 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 fill = '0';
7804 break;
7805
7806 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007807 pbuf = formatbuf;
7808 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 if (len < 0)
7810 goto onError;
7811 break;
7812
7813 default:
7814 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007815 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007816 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007817 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007818 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007819 (Py_ssize_t)(fmt - 1 -
7820 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 goto onError;
7822 }
7823 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007824 if (*pbuf == '-' || *pbuf == '+') {
7825 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 len--;
7827 }
7828 else if (flags & F_SIGN)
7829 sign = '+';
7830 else if (flags & F_BLANK)
7831 sign = ' ';
7832 else
7833 sign = 0;
7834 }
7835 if (width < len)
7836 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007837 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 reslen -= rescnt;
7839 rescnt = width + fmtcnt + 100;
7840 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007841 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007842 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007843 PyErr_NoMemory();
7844 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007845 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007846 if (_PyUnicode_Resize(&result, reslen) < 0) {
7847 Py_XDECREF(temp);
7848 goto onError;
7849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 res = PyUnicode_AS_UNICODE(result)
7851 + reslen - rescnt;
7852 }
7853 if (sign) {
7854 if (fill != ' ')
7855 *res++ = sign;
7856 rescnt--;
7857 if (width > len)
7858 width--;
7859 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007860 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7861 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007862 assert(pbuf[1] == c);
7863 if (fill != ' ') {
7864 *res++ = *pbuf++;
7865 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007866 }
Tim Petersfff53252001-04-12 18:38:48 +00007867 rescnt -= 2;
7868 width -= 2;
7869 if (width < 0)
7870 width = 0;
7871 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 if (width > len && !(flags & F_LJUST)) {
7874 do {
7875 --rescnt;
7876 *res++ = fill;
7877 } while (--width > len);
7878 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007879 if (fill == ' ') {
7880 if (sign)
7881 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007882 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007883 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007884 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007885 *res++ = *pbuf++;
7886 *res++ = *pbuf++;
7887 }
7888 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007889 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 res += len;
7891 rescnt -= len;
7892 while (--width >= len) {
7893 --rescnt;
7894 *res++ = ' ';
7895 }
7896 if (dict && (argidx < arglen) && c != '%') {
7897 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007898 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007899 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 goto onError;
7901 }
7902 Py_XDECREF(temp);
7903 } /* '%' */
7904 } /* until end */
7905 if (argidx < arglen && !dict) {
7906 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007907 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 goto onError;
7909 }
7910
Thomas Woutersa96affe2006-03-12 00:29:36 +00007911 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7912 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 if (args_owned) {
7914 Py_DECREF(args);
7915 }
7916 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 return (PyObject *)result;
7918
7919 onError:
7920 Py_XDECREF(result);
7921 Py_DECREF(uformat);
7922 if (args_owned) {
7923 Py_DECREF(args);
7924 }
7925 return NULL;
7926}
7927
7928static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007929 (readbufferproc) unicode_buffer_getreadbuf,
7930 (writebufferproc) unicode_buffer_getwritebuf,
7931 (segcountproc) unicode_buffer_getsegcount,
7932 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933};
7934
Jeremy Hylton938ace62002-07-17 16:30:39 +00007935static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007936unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7937
Tim Peters6d6c1a32001-08-02 04:15:00 +00007938static PyObject *
7939unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7940{
7941 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007942 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007943 char *encoding = NULL;
7944 char *errors = NULL;
7945
Guido van Rossume023fe02001-08-30 03:12:59 +00007946 if (type != &PyUnicode_Type)
7947 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007948 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7949 kwlist, &x, &encoding, &errors))
7950 return NULL;
7951 if (x == NULL)
7952 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007953 if (encoding == NULL && errors == NULL)
7954 return PyObject_Unicode(x);
7955 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007956 return PyUnicode_FromEncodedObject(x, encoding, errors);
7957}
7958
Guido van Rossume023fe02001-08-30 03:12:59 +00007959static PyObject *
7960unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7961{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007962 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007963 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007964
7965 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7966 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7967 if (tmp == NULL)
7968 return NULL;
7969 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007970 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007971 if (pnew == NULL) {
7972 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007973 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007974 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007975 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7976 if (pnew->str == NULL) {
7977 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007978 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007979 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007980 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007981 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007982 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7983 pnew->length = n;
7984 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007985 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007986 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007987}
7988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007989PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007990"unicode(string [, encoding[, errors]]) -> object\n\
7991\n\
7992Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007993encoding defaults to the current default string encoding.\n\
7994errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007995
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007996static PyObject *unicode_iter(PyObject *seq);
7997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998PyTypeObject PyUnicode_Type = {
7999 PyObject_HEAD_INIT(&PyType_Type)
8000 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008001 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 sizeof(PyUnicodeObject), /* tp_size */
8003 0, /* tp_itemsize */
8004 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008005 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008007 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008009 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008010 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008011 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 (hashfunc) unicode_hash, /* tp_hash*/
8015 0, /* tp_call*/
8016 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008017 PyObject_GenericGetAttr, /* tp_getattro */
8018 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008020 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8021 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008022 unicode_doc, /* tp_doc */
8023 0, /* tp_traverse */
8024 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008025 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008026 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008027 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008028 0, /* tp_iternext */
8029 unicode_methods, /* tp_methods */
8030 0, /* tp_members */
8031 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008032 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008033 0, /* tp_dict */
8034 0, /* tp_descr_get */
8035 0, /* tp_descr_set */
8036 0, /* tp_dictoffset */
8037 0, /* tp_init */
8038 0, /* tp_alloc */
8039 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008040 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041};
8042
8043/* Initialize the Unicode implementation */
8044
Thomas Wouters78890102000-07-22 19:25:51 +00008045void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008047 int i;
8048
Thomas Wouters477c8d52006-05-27 19:21:47 +00008049 /* XXX - move this array to unicodectype.c ? */
8050 Py_UNICODE linebreak[] = {
8051 0x000A, /* LINE FEED */
8052 0x000D, /* CARRIAGE RETURN */
8053 0x001C, /* FILE SEPARATOR */
8054 0x001D, /* GROUP SEPARATOR */
8055 0x001E, /* RECORD SEPARATOR */
8056 0x0085, /* NEXT LINE */
8057 0x2028, /* LINE SEPARATOR */
8058 0x2029, /* PARAGRAPH SEPARATOR */
8059 };
8060
Fred Drakee4315f52000-05-09 19:53:39 +00008061 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008062 unicode_freelist = NULL;
8063 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008065 if (!unicode_empty)
8066 return;
8067
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008068 for (i = 0; i < 256; i++)
8069 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008070 if (PyType_Ready(&PyUnicode_Type) < 0)
8071 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008072
8073 /* initialize the linebreak bloom filter */
8074 bloom_linebreak = make_bloom_mask(
8075 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8076 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008077
8078 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079}
8080
8081/* Finalize the Unicode implementation */
8082
8083void
Thomas Wouters78890102000-07-22 19:25:51 +00008084_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008086 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008087 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008089 Py_XDECREF(unicode_empty);
8090 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008091
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008092 for (i = 0; i < 256; i++) {
8093 if (unicode_latin1[i]) {
8094 Py_DECREF(unicode_latin1[i]);
8095 unicode_latin1[i] = NULL;
8096 }
8097 }
8098
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008099 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 PyUnicodeObject *v = u;
8101 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008102 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008103 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008104 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008105 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008107 unicode_freelist = NULL;
8108 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008110
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008111
8112
8113/********************* Unicode Iterator **************************/
8114
8115typedef struct {
8116 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008117 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008118 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8119} unicodeiterobject;
8120
8121static void
8122unicodeiter_dealloc(unicodeiterobject *it)
8123{
8124 _PyObject_GC_UNTRACK(it);
8125 Py_XDECREF(it->it_seq);
8126 PyObject_GC_Del(it);
8127}
8128
8129static int
8130unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8131{
8132 Py_VISIT(it->it_seq);
8133 return 0;
8134}
8135
8136static PyObject *
8137unicodeiter_next(unicodeiterobject *it)
8138{
8139 PyUnicodeObject *seq;
8140 PyObject *item;
8141
8142 assert(it != NULL);
8143 seq = it->it_seq;
8144 if (seq == NULL)
8145 return NULL;
8146 assert(PyUnicode_Check(seq));
8147
8148 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008149 item = PyUnicode_FromUnicode(
8150 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008151 if (item != NULL)
8152 ++it->it_index;
8153 return item;
8154 }
8155
8156 Py_DECREF(seq);
8157 it->it_seq = NULL;
8158 return NULL;
8159}
8160
8161static PyObject *
8162unicodeiter_len(unicodeiterobject *it)
8163{
8164 Py_ssize_t len = 0;
8165 if (it->it_seq)
8166 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8167 return PyInt_FromSsize_t(len);
8168}
8169
8170PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8171
8172static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008173 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8174 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008175 {NULL, NULL} /* sentinel */
8176};
8177
8178PyTypeObject PyUnicodeIter_Type = {
8179 PyObject_HEAD_INIT(&PyType_Type)
8180 0, /* ob_size */
8181 "unicodeiterator", /* tp_name */
8182 sizeof(unicodeiterobject), /* tp_basicsize */
8183 0, /* tp_itemsize */
8184 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008185 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008186 0, /* tp_print */
8187 0, /* tp_getattr */
8188 0, /* tp_setattr */
8189 0, /* tp_compare */
8190 0, /* tp_repr */
8191 0, /* tp_as_number */
8192 0, /* tp_as_sequence */
8193 0, /* tp_as_mapping */
8194 0, /* tp_hash */
8195 0, /* tp_call */
8196 0, /* tp_str */
8197 PyObject_GenericGetAttr, /* tp_getattro */
8198 0, /* tp_setattro */
8199 0, /* tp_as_buffer */
8200 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8201 0, /* tp_doc */
8202 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8203 0, /* tp_clear */
8204 0, /* tp_richcompare */
8205 0, /* tp_weaklistoffset */
8206 PyObject_SelfIter, /* tp_iter */
8207 (iternextfunc)unicodeiter_next, /* tp_iternext */
8208 unicodeiter_methods, /* tp_methods */
8209 0,
8210};
8211
8212static PyObject *
8213unicode_iter(PyObject *seq)
8214{
8215 unicodeiterobject *it;
8216
8217 if (!PyUnicode_Check(seq)) {
8218 PyErr_BadInternalCall();
8219 return NULL;
8220 }
8221 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8222 if (it == NULL)
8223 return NULL;
8224 it->it_index = 0;
8225 Py_INCREF(seq);
8226 it->it_seq = (PyUnicodeObject *)seq;
8227 _PyObject_GC_TRACK(it);
8228 return (PyObject *)it;
8229}
8230
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008231#ifdef __cplusplus
8232}
8233#endif
8234
8235
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008236/*
8237Local variables:
8238c-basic-offset: 4
8239indent-tabs-mode: nil
8240End:
8241*/