blob: c9a922dd80c82663fb956ef3e69b35c89178484e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
125/* stuff to implement simple "bloom filters" for Unicode characters.
126 to keep things simple, we use a single bitmask, using the least 5
127 bits from each unicode characters as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
131#define BLOOM_MASK unsigned long
132
133static BLOOM_MASK bloom_linebreak;
134
/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   The 1 is an unsigned long so that shifting by 31 is well defined, and
   mask is parenthesized so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
136
137#define BLOOM_LINEBREAK(ch)\
138 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Membership test: reject cheaply via the bloom mask first and scan the
   set only on a possible hit.  The whole expansion and every argument are
   parenthesized so the macro composes safely with other operators
   (e.g. !BLOOM_MEMBER(...) or x || BLOOM_MEMBER(...)). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
351#define _PyUnicode_Resize(unicodevar, length) \
352 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
399 Py_ssize_t size = strlen(u);
400
401 /* If the Unicode data is known at construction time, we can apply
402 some optimizations which share commonly used objects. */
403 if (u != NULL) {
404
405 /* Optimization for empty strings */
406 if (size == 0 && unicode_empty != NULL) {
407 Py_INCREF(unicode_empty);
408 return (PyObject *)unicode_empty;
409 }
410
411 /* Single character Unicode objects in the Latin-1 range are
412 shared when using this constructor */
413 if (size == 1 && *u < 256) {
414 unicode = unicode_latin1[*u];
415 if (!unicode) {
416 unicode = _PyUnicode_New(1);
417 if (!unicode)
418 return NULL;
419 unicode->str[0] = *u;
420 unicode_latin1[*u] = unicode;
421 }
422 Py_INCREF(unicode);
423 return (PyObject *)unicode;
424 }
425 }
426
427 unicode = _PyUnicode_New(size);
428 if (!unicode)
429 return NULL;
430
431 /* Copy the Unicode data into the new object */
432 if (u != NULL) {
433 char *p = unicode->str;
434 while (*p++ = *u++)
435 ;
436 }
437
438 return (PyObject *)unicode;
439}
440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441#ifdef HAVE_WCHAR_H
442
443PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445{
446 PyUnicodeObject *unicode;
447
448 if (w == NULL) {
449 PyErr_BadInternalCall();
450 return NULL;
451 }
452
453 unicode = _PyUnicode_New(size);
454 if (!unicode)
455 return NULL;
456
457 /* Copy the wchar_t data into the new object */
458#ifdef HAVE_USABLE_WCHAR_T
459 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000460#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 {
462 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000463 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000465 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000466 *u++ = *w++;
467 }
468#endif
469
470 return (PyObject *)unicode;
471}
472
Martin v. Löwis18e16552006-02-15 17:27:45 +0000473Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
474 wchar_t *w,
475 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476{
477 if (unicode == NULL) {
478 PyErr_BadInternalCall();
479 return -1;
480 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000481
482 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000484 size = PyUnicode_GET_SIZE(unicode) + 1;
485
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486#ifdef HAVE_USABLE_WCHAR_T
487 memcpy(w, unicode->str, size * sizeof(wchar_t));
488#else
489 {
490 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000491 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000492 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000493 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494 *w++ = *u++;
495 }
496#endif
497
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000498 if (size > PyUnicode_GET_SIZE(unicode))
499 return PyUnicode_GET_SIZE(unicode);
500 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return size;
502}
503
504#endif
505
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000506PyObject *PyUnicode_FromOrdinal(int ordinal)
507{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000508 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000509
510#ifdef Py_UNICODE_WIDE
511 if (ordinal < 0 || ordinal > 0x10ffff) {
512 PyErr_SetString(PyExc_ValueError,
513 "unichr() arg not in range(0x110000) "
514 "(wide Python build)");
515 return NULL;
516 }
517#else
518 if (ordinal < 0 || ordinal > 0xffff) {
519 PyErr_SetString(PyExc_ValueError,
520 "unichr() arg not in range(0x10000) "
521 "(narrow Python build)");
522 return NULL;
523 }
524#endif
525
Hye-Shik Chang40574832004-04-06 07:24:51 +0000526 s[0] = (Py_UNICODE)ordinal;
527 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530PyObject *PyUnicode_FromObject(register PyObject *obj)
531{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000532 /* XXX Perhaps we should make this API an alias of
533 PyObject_Unicode() instead ?! */
534 if (PyUnicode_CheckExact(obj)) {
535 Py_INCREF(obj);
536 return obj;
537 }
538 if (PyUnicode_Check(obj)) {
539 /* For a Unicode subtype that's not a Unicode object,
540 return a true Unicode object with the same data. */
541 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
542 PyUnicode_GET_SIZE(obj));
543 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000544 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
545}
546
547PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
548 const char *encoding,
549 const char *errors)
550{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000551 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000552 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000553 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 if (obj == NULL) {
556 PyErr_BadInternalCall();
557 return NULL;
558 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000559
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560#if 0
561 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000562 that no encodings is given and then redirect to
563 PyObject_Unicode() which then applies the additional logic for
564 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000565
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000566 NOTE: This API should really only be used for object which
567 represent *encoded* Unicode !
568
569 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000570 if (PyUnicode_Check(obj)) {
571 if (encoding) {
572 PyErr_SetString(PyExc_TypeError,
573 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000574 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000575 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000576 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000577 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000578#else
579 if (PyUnicode_Check(obj)) {
580 PyErr_SetString(PyExc_TypeError,
581 "decoding Unicode is not supported");
582 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000583 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000584#endif
585
586 /* Coerce object */
587 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000588 s = PyString_AS_STRING(obj);
589 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000590 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000591 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
592 /* Overwrite the error message with something more useful in
593 case of a TypeError. */
594 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000595 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000596 "coercing to Unicode: need string or buffer, "
597 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000598 obj->ob_type->tp_name);
599 goto onError;
600 }
Tim Petersced69f82003-09-16 20:30:58 +0000601
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000602 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603 if (len == 0) {
604 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000605 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 }
Tim Petersced69f82003-09-16 20:30:58 +0000607 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000608 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000609
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000610 return v;
611
612 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614}
615
616PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000617 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618 const char *encoding,
619 const char *errors)
620{
621 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000622
623 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000624 encoding = PyUnicode_GetDefaultEncoding();
625
626 /* Shortcuts for common default encodings */
627 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000629 else if (strcmp(encoding, "latin-1") == 0)
630 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000631#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
632 else if (strcmp(encoding, "mbcs") == 0)
633 return PyUnicode_DecodeMBCS(s, size, errors);
634#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000635 else if (strcmp(encoding, "ascii") == 0)
636 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638 /* Decode via the codec registry */
639 buffer = PyBuffer_FromMemory((void *)s, size);
640 if (buffer == NULL)
641 goto onError;
642 unicode = PyCodec_Decode(buffer, encoding, errors);
643 if (unicode == NULL)
644 goto onError;
645 if (!PyUnicode_Check(unicode)) {
646 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000647 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 unicode->ob_type->tp_name);
649 Py_DECREF(unicode);
650 goto onError;
651 }
652 Py_DECREF(buffer);
653 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 onError:
656 Py_XDECREF(buffer);
657 return NULL;
658}
659
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000660PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
661 const char *encoding,
662 const char *errors)
663{
664 PyObject *v;
665
666 if (!PyUnicode_Check(unicode)) {
667 PyErr_BadArgument();
668 goto onError;
669 }
670
671 if (encoding == NULL)
672 encoding = PyUnicode_GetDefaultEncoding();
673
674 /* Decode via the codec registry */
675 v = PyCodec_Decode(unicode, encoding, errors);
676 if (v == NULL)
677 goto onError;
678 return v;
679
680 onError:
681 return NULL;
682}
683
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000685 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 const char *encoding,
687 const char *errors)
688{
689 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000690
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 unicode = PyUnicode_FromUnicode(s, size);
692 if (unicode == NULL)
693 return NULL;
694 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
695 Py_DECREF(unicode);
696 return v;
697}
698
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000699PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
700 const char *encoding,
701 const char *errors)
702{
703 PyObject *v;
704
705 if (!PyUnicode_Check(unicode)) {
706 PyErr_BadArgument();
707 goto onError;
708 }
709
710 if (encoding == NULL)
711 encoding = PyUnicode_GetDefaultEncoding();
712
713 /* Encode via the codec registry */
714 v = PyCodec_Encode(unicode, encoding, errors);
715 if (v == NULL)
716 goto onError;
717 return v;
718
719 onError:
720 return NULL;
721}
722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
724 const char *encoding,
725 const char *errors)
726{
727 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000728
Guido van Rossumd57fd912000-03-10 22:53:23 +0000729 if (!PyUnicode_Check(unicode)) {
730 PyErr_BadArgument();
731 goto onError;
732 }
Fred Drakee4315f52000-05-09 19:53:39 +0000733
Tim Petersced69f82003-09-16 20:30:58 +0000734 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000735 encoding = PyUnicode_GetDefaultEncoding();
736
737 /* Shortcuts for common default encodings */
738 if (errors == NULL) {
739 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000740 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000741 else if (strcmp(encoding, "latin-1") == 0)
742 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000743#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
744 else if (strcmp(encoding, "mbcs") == 0)
745 return PyUnicode_AsMBCSString(unicode);
746#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000747 else if (strcmp(encoding, "ascii") == 0)
748 return PyUnicode_AsASCIIString(unicode);
749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000750
751 /* Encode via the codec registry */
752 v = PyCodec_Encode(unicode, encoding, errors);
753 if (v == NULL)
754 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000755 if (!PyBytes_Check(v)) {
756 if (PyString_Check(v)) {
757 /* Old codec, turn it into bytes */
758 PyObject *b = PyBytes_FromObject(v);
759 Py_DECREF(v);
760 return b;
761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000763 "encoder did not return a bytes object "
764 "(type=%.400s, encoding=%.20s, errors=%.20s)",
765 v->ob_type->tp_name,
766 encoding ? encoding : "NULL",
767 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 Py_DECREF(v);
769 goto onError;
770 }
771 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Guido van Rossumd57fd912000-03-10 22:53:23 +0000773 onError:
774 return NULL;
775}
776
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000777PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
778 const char *errors)
779{
780 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000781 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000782 if (v)
783 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000784 if (errors != NULL)
785 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
786 if (errors == NULL) {
787 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
788 PyUnicode_GET_SIZE(unicode),
789 NULL);
790 }
791 else {
792 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
793 }
794 if (!b)
795 return NULL;
796 v = PyString_FromStringAndSize(PyBytes_AsString(b),
797 PyBytes_Size(b));
798 Py_DECREF(b);
799 if (!errors) {
800 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000801 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000802 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000803 return v;
804}
805
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
807{
808 if (!PyUnicode_Check(unicode)) {
809 PyErr_BadArgument();
810 goto onError;
811 }
812 return PyUnicode_AS_UNICODE(unicode);
813
814 onError:
815 return NULL;
816}
817
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819{
820 if (!PyUnicode_Check(unicode)) {
821 PyErr_BadArgument();
822 goto onError;
823 }
824 return PyUnicode_GET_SIZE(unicode);
825
826 onError:
827 return -1;
828}
829
Thomas Wouters78890102000-07-22 19:25:51 +0000830const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000831{
832 return unicode_default_encoding;
833}
834
835int PyUnicode_SetDefaultEncoding(const char *encoding)
836{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000837 if (strcmp(encoding, unicode_default_encoding) != 0) {
838 PyErr_Format(PyExc_ValueError,
839 "Can only set default encoding to %s",
840 unicode_default_encoding);
841 return -1;
842 }
Fred Drakee4315f52000-05-09 19:53:39 +0000843 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000844}
845
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846/* error handling callback helper:
847 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000848 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849 and adjust various state variables.
850 return 0 on success, -1 on error
851*/
852
853static
854int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
855 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000856 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
857 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000859 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860
861 PyObject *restuple = NULL;
862 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000863 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
864 Py_ssize_t requiredsize;
865 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000866 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000867 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000868 int res = -1;
869
870 if (*errorHandler == NULL) {
871 *errorHandler = PyCodec_LookupError(errors);
872 if (*errorHandler == NULL)
873 goto onError;
874 }
875
876 if (*exceptionObject == NULL) {
877 *exceptionObject = PyUnicodeDecodeError_Create(
878 encoding, input, insize, *startinpos, *endinpos, reason);
879 if (*exceptionObject == NULL)
880 goto onError;
881 }
882 else {
883 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
884 goto onError;
885 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
886 goto onError;
887 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
888 goto onError;
889 }
890
891 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
892 if (restuple == NULL)
893 goto onError;
894 if (!PyTuple_Check(restuple)) {
895 PyErr_Format(PyExc_TypeError, &argparse[4]);
896 goto onError;
897 }
898 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
899 goto onError;
900 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000901 newpos = insize+newpos;
902 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000903 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000904 goto onError;
905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000906
907 /* need more space? (at least enough for what we
908 have+the replacement+the rest of the string (starting
909 at the new input position), so we won't have to check space
910 when there are no errors in the rest of the string) */
911 repptr = PyUnicode_AS_UNICODE(repunicode);
912 repsize = PyUnicode_GET_SIZE(repunicode);
913 requiredsize = *outpos + repsize + insize-newpos;
914 if (requiredsize > outsize) {
915 if (requiredsize<2*outsize)
916 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000917 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 goto onError;
919 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
920 }
921 *endinpos = newpos;
922 *inptr = input + newpos;
923 Py_UNICODE_COPY(*outptr, repptr, repsize);
924 *outptr += repsize;
925 *outpos += repsize;
926 /* we made it! */
927 res = 0;
928
929 onError:
930 Py_XDECREF(restuple);
931 return res;
932}
933
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934/* --- UTF-7 Codec -------------------------------------------------------- */
935
936/* see RFC2152 for details */
937
static
char utf7_special[128] = {
    /* Classifies each 7-bit character for the UTF-7 codec, i.e. whether it
       is special (cannot be directly encoded):
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
956
/* SPECIAL(c, encodeO, encodeWS): true if character `c` cannot be written
   directly in UTF-7.  That is the case for anything outside 1..127, for
   table class 1, for class 2 (whitespace) when encodeWS is set and for
   class 3 (RFC2152 Set O) when encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.
   The ranges are spelled out instead of calling isalnum(): that function
   is locale dependent and its behaviour is undefined for arguments that
   are not representable as unsigned char, which Py_UNICODE values and
   sign-extended bytes can be. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
/* ENCODE(out, ch, bits): write one base64 digit through `out` for every
   complete 6-bit group among the `bits` bits buffered in `ch`, most
   significant group first.  `bits` is reduced by 6 per digit, so fewer
   than 6 bits are left when the loop ends. */
#define ENCODE(out, ch, bits)                       \
    while ((bits) >= 6) {                           \
        *(out)++ = B64((ch) >> ((bits) - 6));       \
        (bits) -= 6;                                \
    }

/* DECODE(out, ch, bits, surrogate): take complete 16-bit units from the
   `bits` bits buffered in `ch` and store them through `out`.

   A unit in 0xDC00..0xDFFF is not stored: it sets `surrogate`, sets the
   enclosing function's `errmsg` and jumps to its `utf7Error` label (such
   code pairs cannot be represented in a 16-bit character).  While
   `surrogate` is set, the next unit is dropped silently and the flag is
   cleared, because an error was already generated for its partner. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while ((bits) >= 16) {                                                  \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        (bits) -= 16;                                                       \
        if (surrogate) {                                                    \
            (surrogate) = 0;                                                \
        }                                                                   \
        else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                      \
            (surrogate) = 1;                                                \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        }                                                                   \
        else {                                                              \
            *(out)++ = outCh;                                               \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000999
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001001 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 const char *errors)
1003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005 Py_ssize_t startinpos;
1006 Py_ssize_t endinpos;
1007 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 const char *e;
1009 PyUnicodeObject *unicode;
1010 Py_UNICODE *p;
1011 const char *errmsg = "";
1012 int inShift = 0;
1013 unsigned int bitsleft = 0;
1014 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001015 int surrogate = 0;
1016 PyObject *errorHandler = NULL;
1017 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001018
1019 unicode = _PyUnicode_New(size);
1020 if (!unicode)
1021 return NULL;
1022 if (size == 0)
1023 return (PyObject *)unicode;
1024
1025 p = unicode->str;
1026 e = s + size;
1027
1028 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001029 Py_UNICODE ch;
1030 restart:
1031 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032
1033 if (inShift) {
1034 if ((ch == '-') || !B64CHAR(ch)) {
1035 inShift = 0;
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1039 if (bitsleft >= 6) {
1040 /* The shift sequence has a partial character in it. If
1041 bitsleft < 6 then we could just classify it as padding
1042 but that is not the case here */
1043
1044 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001045 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
1047 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001048 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 here so indicate the potential of a misencoded character. */
1050
1051 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1052 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1053 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001054 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 }
1056
1057 if (ch == '-') {
1058 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001059 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001060 inShift = 1;
1061 }
1062 } else if (SPECIAL(ch,0,0)) {
1063 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001064 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001065 } else {
1066 *p++ = ch;
1067 }
1068 } else {
1069 charsleft = (charsleft << 6) | UB64(ch);
1070 bitsleft += 6;
1071 s++;
1072 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1073 }
1074 }
1075 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 s++;
1078 if (s < e && *s == '-') {
1079 s++;
1080 *p++ = '+';
1081 } else
1082 {
1083 inShift = 1;
1084 bitsleft = 0;
1085 }
1086 }
1087 else if (SPECIAL(ch,0,0)) {
1088 errmsg = "unexpected special character";
1089 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001090 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 }
1092 else {
1093 *p++ = ch;
1094 s++;
1095 }
1096 continue;
1097 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001098 outpos = p-PyUnicode_AS_UNICODE(unicode);
1099 endinpos = s-starts;
1100 if (unicode_decode_call_errorhandler(
1101 errors, &errorHandler,
1102 "utf7", errmsg,
1103 starts, size, &startinpos, &endinpos, &exc, &s,
1104 (PyObject **)&unicode, &outpos, &p))
1105 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001106 }
1107
1108 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 outpos = p-PyUnicode_AS_UNICODE(unicode);
1110 endinpos = size;
1111 if (unicode_decode_call_errorhandler(
1112 errors, &errorHandler,
1113 "utf7", "unterminated shift sequence",
1114 starts, size, &startinpos, &endinpos, &exc, &s,
1115 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001117 if (s < e)
1118 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 }
1120
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001121 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 goto onError;
1123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001124 Py_XDECREF(errorHandler);
1125 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001126 return (PyObject *)unicode;
1127
1128onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001129 Py_XDECREF(errorHandler);
1130 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001131 Py_DECREF(unicode);
1132 return NULL;
1133}
1134
1135
1136PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001137 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001138 int encodeSetO,
1139 int encodeWhiteSpace,
1140 const char *errors)
1141{
1142 PyObject *v;
1143 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001144 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001145 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001146 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001147 unsigned int bitsleft = 0;
1148 unsigned long charsleft = 0;
1149 char * out;
1150 char * start;
1151
1152 if (size == 0)
1153 return PyString_FromStringAndSize(NULL, 0);
1154
1155 v = PyString_FromStringAndSize(NULL, cbAllocated);
1156 if (v == NULL)
1157 return NULL;
1158
1159 start = out = PyString_AS_STRING(v);
1160 for (;i < size; ++i) {
1161 Py_UNICODE ch = s[i];
1162
1163 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001164 if (ch == '+') {
1165 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001166 *out++ = '-';
1167 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1168 charsleft = ch;
1169 bitsleft = 16;
1170 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001171 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001172 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001173 } else {
1174 *out++ = (char) ch;
1175 }
1176 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001177 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1178 *out++ = B64(charsleft << (6-bitsleft));
1179 charsleft = 0;
1180 bitsleft = 0;
1181 /* Characters not in the BASE64 set implicitly unshift the sequence
1182 so no '-' is required, except if the character is itself a '-' */
1183 if (B64CHAR(ch) || ch == '-') {
1184 *out++ = '-';
1185 }
1186 inShift = 0;
1187 *out++ = (char) ch;
1188 } else {
1189 bitsleft += 16;
1190 charsleft = (charsleft << 16) | ch;
1191 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1192
1193 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001194 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001195 or '-' then the shift sequence will be terminated implicitly and we
1196 don't have to insert a '-'. */
1197
1198 if (bitsleft == 0) {
1199 if (i + 1 < size) {
1200 Py_UNICODE ch2 = s[i+1];
1201
1202 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001203
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001204 } else if (B64CHAR(ch2) || ch2 == '-') {
1205 *out++ = '-';
1206 inShift = 0;
1207 } else {
1208 inShift = 0;
1209 }
1210
1211 }
1212 else {
1213 *out++ = '-';
1214 inShift = 0;
1215 }
1216 }
Tim Petersced69f82003-09-16 20:30:58 +00001217 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001218 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001219 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001220 if (bitsleft) {
1221 *out++= B64(charsleft << (6-bitsleft) );
1222 *out++ = '-';
1223 }
1224
Tim Peters5de98422002-04-27 18:44:32 +00001225 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001226 return v;
1227}
1228
1229#undef SPECIAL
1230#undef B64
1231#undef B64CHAR
1232#undef UB64
1233#undef ENCODE
1234#undef DECODE
1235
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236/* --- UTF-8 Codec -------------------------------------------------------- */
1237
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1259
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001261 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 const char *errors)
1263{
Walter Dörwald69652032004-09-07 20:24:22 +00001264 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1265}
1266
1267PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001268 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001269 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001270 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001271{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 Py_ssize_t startinpos;
1275 Py_ssize_t endinpos;
1276 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 const char *e;
1278 PyUnicodeObject *unicode;
1279 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001281 PyObject *errorHandler = NULL;
1282 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 /* Note: size will always be longer than the resulting Unicode
1285 character count */
1286 unicode = _PyUnicode_New(size);
1287 if (!unicode)
1288 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001289 if (size == 0) {
1290 if (consumed)
1291 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294
1295 /* Unpack UTF-8 encoded data */
1296 p = unicode->str;
1297 e = s + size;
1298
1299 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001300 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
1302 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 s++;
1305 continue;
1306 }
1307
1308 n = utf8_code_length[ch];
1309
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001311 if (consumed)
1312 break;
1313 else {
1314 errmsg = "unexpected end of data";
1315 startinpos = s-starts;
1316 endinpos = size;
1317 goto utf8Error;
1318 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320
1321 switch (n) {
1322
1323 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325 startinpos = s-starts;
1326 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001327 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001336 if ((s[1] & 0xc0) != 0x80) {
1337 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 startinpos = s-starts;
1339 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001340 goto utf8Error;
1341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 errmsg = "illegal encoding";
1347 goto utf8Error;
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001350 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 break;
1352
1353 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001354 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001355 (s[2] & 0xc0) != 0x80) {
1356 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 startinpos = s-starts;
1358 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001359 goto utf8Error;
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001362 if (ch < 0x0800) {
1363 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001364 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001365
1366 XXX For wide builds (UCS-4) we should probably try
1367 to recombine the surrogates into a single code
1368 unit.
1369 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001370 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371 startinpos = s-starts;
1372 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001373 goto utf8Error;
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001376 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001377 break;
1378
1379 case 4:
1380 if ((s[1] & 0xc0) != 0x80 ||
1381 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 (s[3] & 0xc0) != 0x80) {
1383 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001384 startinpos = s-starts;
1385 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001386 goto utf8Error;
1387 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001388 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1389 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1390 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001391 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001392 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001393 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001394 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001395 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001396 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 startinpos = s-starts;
1398 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001399 goto utf8Error;
1400 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001401#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402 *p++ = (Py_UNICODE)ch;
1403#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001404 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001405
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001406 /* translate from 10000..10FFFF to 0..FFFF */
1407 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 /* high surrogate = top 10 bits added to D800 */
1410 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001411
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001413 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001414#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 break;
1416
1417 default:
1418 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001419 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 startinpos = s-starts;
1421 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001422 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 }
1424 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001425 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001426
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001427 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001428 outpos = p-PyUnicode_AS_UNICODE(unicode);
1429 if (unicode_decode_call_errorhandler(
1430 errors, &errorHandler,
1431 "utf8", errmsg,
1432 starts, size, &startinpos, &endinpos, &exc, &s,
1433 (PyObject **)&unicode, &outpos, &p))
1434 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 }
Walter Dörwald69652032004-09-07 20:24:22 +00001436 if (consumed)
1437 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438
1439 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001440 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 goto onError;
1442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 Py_XDECREF(errorHandler);
1444 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 return (PyObject *)unicode;
1446
1447onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001448 Py_XDECREF(errorHandler);
1449 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 Py_DECREF(unicode);
1451 return NULL;
1452}
1453
Tim Peters602f7402002-04-27 18:03:26 +00001454/* Allocation strategy: if the string is short, convert into a stack buffer
1455 and allocate exactly as much space needed at the end. Else allocate the
1456 maximum possible needed (4 result bytes per Unicode character), and return
1457 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001458*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001459PyObject *
1460PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001461 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463{
Tim Peters602f7402002-04-27 18:03:26 +00001464#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001465
Martin v. Löwis18e16552006-02-15 17:27:45 +00001466 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001467 PyObject *v; /* result string object */
1468 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001469 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001470 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001471 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001472
Tim Peters602f7402002-04-27 18:03:26 +00001473 assert(s != NULL);
1474 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001475
Tim Peters602f7402002-04-27 18:03:26 +00001476 if (size <= MAX_SHORT_UNICHARS) {
1477 /* Write into the stack buffer; nallocated can't overflow.
1478 * At the end, we'll allocate exactly as much heap space as it
1479 * turns out we need.
1480 */
1481 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1482 v = NULL; /* will allocate after we're done */
1483 p = stackbuf;
1484 }
1485 else {
1486 /* Overallocate on the heap, and give the excess back at the end. */
1487 nallocated = size * 4;
1488 if (nallocated / 4 != size) /* overflow! */
1489 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001490 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001491 if (v == NULL)
1492 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001493 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001494 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001495
Tim Peters602f7402002-04-27 18:03:26 +00001496 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001497 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001498
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001499 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001500 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001502
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001504 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001505 *p++ = (char)(0xc0 | (ch >> 6));
1506 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001507 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001508 else {
Tim Peters602f7402002-04-27 18:03:26 +00001509 /* Encode UCS2 Unicode ordinals */
1510 if (ch < 0x10000) {
1511 /* Special case: check for high surrogate */
1512 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1513 Py_UCS4 ch2 = s[i];
1514 /* Check for low surrogate and combine the two to
1515 form a UCS4 value */
1516 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001517 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001518 i++;
1519 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001520 }
Tim Peters602f7402002-04-27 18:03:26 +00001521 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001522 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001523 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001524 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1525 *p++ = (char)(0x80 | (ch & 0x3f));
1526 continue;
1527 }
1528encodeUCS4:
1529 /* Encode UCS4 Unicode ordinals */
1530 *p++ = (char)(0xf0 | (ch >> 18));
1531 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1532 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1533 *p++ = (char)(0x80 | (ch & 0x3f));
1534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001536
Tim Peters602f7402002-04-27 18:03:26 +00001537 if (v == NULL) {
1538 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001539 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001540 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001541 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001542 }
1543 else {
1544 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001545 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001546 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001550
Tim Peters602f7402002-04-27 18:03:26 +00001551#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552}
1553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1555{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556 if (!PyUnicode_Check(unicode)) {
1557 PyErr_BadArgument();
1558 return NULL;
1559 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001560 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1561 PyUnicode_GET_SIZE(unicode),
1562 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563}
1564
1565/* --- UTF-16 Codec ------------------------------------------------------- */
1566
Tim Peters772747b2001-08-09 22:21:55 +00001567PyObject *
1568PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001569 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001570 const char *errors,
1571 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572{
Walter Dörwald69652032004-09-07 20:24:22 +00001573 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1574}
1575
1576PyObject *
1577PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001579 const char *errors,
1580 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001581 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001584 Py_ssize_t startinpos;
1585 Py_ssize_t endinpos;
1586 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 PyUnicodeObject *unicode;
1588 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001589 const unsigned char *q, *e;
1590 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001591 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001592 /* Offsets from q for retrieving byte pairs in the right order. */
1593#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1594 int ihi = 1, ilo = 0;
1595#else
1596 int ihi = 0, ilo = 1;
1597#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 PyObject *errorHandler = NULL;
1599 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
1601 /* Note: size will always be longer than the resulting Unicode
1602 character count */
1603 unicode = _PyUnicode_New(size);
1604 if (!unicode)
1605 return NULL;
1606 if (size == 0)
1607 return (PyObject *)unicode;
1608
1609 /* Unpack UTF-16 encoded data */
1610 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001611 q = (unsigned char *)s;
1612 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613
1614 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001615 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001617 /* Check for BOM marks (U+FEFF) in the input and adjust current
1618 byte order setting accordingly. In native mode, the leading BOM
1619 mark is skipped, in all other modes, it is copied to the output
1620 stream as-is (giving a ZWNBSP character). */
1621 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001622 if (size >= 2) {
1623 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001624#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001625 if (bom == 0xFEFF) {
1626 q += 2;
1627 bo = -1;
1628 }
1629 else if (bom == 0xFFFE) {
1630 q += 2;
1631 bo = 1;
1632 }
Tim Petersced69f82003-09-16 20:30:58 +00001633#else
Walter Dörwald69652032004-09-07 20:24:22 +00001634 if (bom == 0xFEFF) {
1635 q += 2;
1636 bo = 1;
1637 }
1638 else if (bom == 0xFFFE) {
1639 q += 2;
1640 bo = -1;
1641 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001642#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001643 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645
Tim Peters772747b2001-08-09 22:21:55 +00001646 if (bo == -1) {
1647 /* force LE */
1648 ihi = 1;
1649 ilo = 0;
1650 }
1651 else if (bo == 1) {
1652 /* force BE */
1653 ihi = 0;
1654 ilo = 1;
1655 }
1656
1657 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001659 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001661 if (consumed)
1662 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001663 errmsg = "truncated data";
1664 startinpos = ((const char *)q)-starts;
1665 endinpos = ((const char *)e)-starts;
1666 goto utf16Error;
1667 /* The remaining input chars are ignored if the callback
1668 chooses to skip the input */
1669 }
1670 ch = (q[ihi] << 8) | q[ilo];
1671
Tim Peters772747b2001-08-09 22:21:55 +00001672 q += 2;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 if (ch < 0xD800 || ch > 0xDFFF) {
1675 *p++ = ch;
1676 continue;
1677 }
1678
1679 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001680 if (q >= e) {
1681 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001682 startinpos = (((const char *)q)-2)-starts;
1683 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001684 goto utf16Error;
1685 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001686 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001687 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1688 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001689 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001690#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001691 *p++ = ch;
1692 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001693#else
1694 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001695#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001696 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
1698 else {
1699 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 startinpos = (((const char *)q)-4)-starts;
1701 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001702 goto utf16Error;
1703 }
1704
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001706 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 startinpos = (((const char *)q)-2)-starts;
1708 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001709 /* Fall through to report the error */
1710
1711 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 outpos = p-PyUnicode_AS_UNICODE(unicode);
1713 if (unicode_decode_call_errorhandler(
1714 errors, &errorHandler,
1715 "utf16", errmsg,
1716 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1717 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 }
1720
1721 if (byteorder)
1722 *byteorder = bo;
1723
Walter Dörwald69652032004-09-07 20:24:22 +00001724 if (consumed)
1725 *consumed = (const char *)q-starts;
1726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001728 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 goto onError;
1730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001731 Py_XDECREF(errorHandler);
1732 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 return (PyObject *)unicode;
1734
1735onError:
1736 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 Py_XDECREF(errorHandler);
1738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return NULL;
1740}
1741
Tim Peters772747b2001-08-09 22:21:55 +00001742PyObject *
1743PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001744 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001745 const char *errors,
1746 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747{
1748 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001749 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001750#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001751 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001752#else
1753 const int pairs = 0;
1754#endif
Tim Peters772747b2001-08-09 22:21:55 +00001755 /* Offsets from p for storing byte pairs in the right order. */
1756#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1757 int ihi = 1, ilo = 0;
1758#else
1759 int ihi = 0, ilo = 1;
1760#endif
1761
1762#define STORECHAR(CH) \
1763 do { \
1764 p[ihi] = ((CH) >> 8) & 0xff; \
1765 p[ilo] = (CH) & 0xff; \
1766 p += 2; \
1767 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001769#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001770 for (i = pairs = 0; i < size; i++)
1771 if (s[i] >= 0x10000)
1772 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001773#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001774 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001775 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 if (v == NULL)
1777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778
Walter Dörwald3cc34522007-05-04 10:48:27 +00001779 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001781 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001782 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001783 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001784
1785 if (byteorder == -1) {
1786 /* force LE */
1787 ihi = 1;
1788 ilo = 0;
1789 }
1790 else if (byteorder == 1) {
1791 /* force BE */
1792 ihi = 0;
1793 ilo = 1;
1794 }
1795
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001796 while (size-- > 0) {
1797 Py_UNICODE ch = *s++;
1798 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001799#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001800 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001801 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1802 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001804#endif
Tim Peters772747b2001-08-09 22:21:55 +00001805 STORECHAR(ch);
1806 if (ch2)
1807 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001810#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811}
1812
1813PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1814{
1815 if (!PyUnicode_Check(unicode)) {
1816 PyErr_BadArgument();
1817 return NULL;
1818 }
1819 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1820 PyUnicode_GET_SIZE(unicode),
1821 NULL,
1822 0);
1823}
1824
1825/* --- Unicode Escape Codec ----------------------------------------------- */
1826
Fredrik Lundh06d12682001-01-24 07:59:11 +00001827static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001828
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001830 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 const char *errors)
1832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001834 Py_ssize_t startinpos;
1835 Py_ssize_t endinpos;
1836 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001841 char* message;
1842 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 PyObject *errorHandler = NULL;
1844 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Escaped strings will always be longer than the resulting
1847 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 length after conversion to the true value.
1849 (but if the error callback returns a long replacement string
1850 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 v = _PyUnicode_New(size);
1852 if (v == NULL)
1853 goto onError;
1854 if (size == 0)
1855 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 while (s < end) {
1861 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001862 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
1865 /* Non-escape characters are interpreted as Unicode ordinals */
1866 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001867 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 continue;
1869 }
1870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 /* \ - Escapes */
1873 s++;
1874 switch (*s++) {
1875
1876 /* \x escapes */
1877 case '\n': break;
1878 case '\\': *p++ = '\\'; break;
1879 case '\'': *p++ = '\''; break;
1880 case '\"': *p++ = '\"'; break;
1881 case 'b': *p++ = '\b'; break;
1882 case 'f': *p++ = '\014'; break; /* FF */
1883 case 't': *p++ = '\t'; break;
1884 case 'n': *p++ = '\n'; break;
1885 case 'r': *p++ = '\r'; break;
1886 case 'v': *p++ = '\013'; break; /* VT */
1887 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1888
1889 /* \OOO (octal) escapes */
1890 case '0': case '1': case '2': case '3':
1891 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001892 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001894 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001896 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 break;
1900
Fredrik Lundhccc74732001-02-18 22:13:49 +00001901 /* hex escapes */
1902 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001904 digits = 2;
1905 message = "truncated \\xXX escape";
1906 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907
Fredrik Lundhccc74732001-02-18 22:13:49 +00001908 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 digits = 4;
1911 message = "truncated \\uXXXX escape";
1912 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001915 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 digits = 8;
1917 message = "truncated \\UXXXXXXXX escape";
1918 hexescape:
1919 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (s+digits>end) {
1922 endinpos = size;
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "unicodeescape", "end of string in escape sequence",
1926 starts, size, &startinpos, &endinpos, &exc, &s,
1927 (PyObject **)&v, &outpos, &p))
1928 goto onError;
1929 goto nextByte;
1930 }
1931 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001932 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001933 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 endinpos = (s+i+1)-starts;
1935 if (unicode_decode_call_errorhandler(
1936 errors, &errorHandler,
1937 "unicodeescape", message,
1938 starts, size, &startinpos, &endinpos, &exc, &s,
1939 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001940 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001942 }
1943 chr = (chr<<4) & ~0xF;
1944 if (c >= '0' && c <= '9')
1945 chr += c - '0';
1946 else if (c >= 'a' && c <= 'f')
1947 chr += 10 + c - 'a';
1948 else
1949 chr += 10 + c - 'A';
1950 }
1951 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001952 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 /* _decoding_error will have already written into the
1954 target buffer. */
1955 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001956 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001957 /* when we get here, chr is a 32-bit unicode character */
1958 if (chr <= 0xffff)
1959 /* UCS-2 character */
1960 *p++ = (Py_UNICODE) chr;
1961 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001962 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001963 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001964#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001965 *p++ = chr;
1966#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001967 chr -= 0x10000L;
1968 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001969 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001970#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001971 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001972 endinpos = s-starts;
1973 outpos = p-PyUnicode_AS_UNICODE(v);
1974 if (unicode_decode_call_errorhandler(
1975 errors, &errorHandler,
1976 "unicodeescape", "illegal Unicode character",
1977 starts, size, &startinpos, &endinpos, &exc, &s,
1978 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001979 goto onError;
1980 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001981 break;
1982
1983 /* \N{name} */
1984 case 'N':
1985 message = "malformed \\N character escape";
1986 if (ucnhash_CAPI == NULL) {
1987 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001988 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001989 m = PyImport_ImportModule("unicodedata");
1990 if (m == NULL)
1991 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001992 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001993 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001996 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001997 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001998 if (ucnhash_CAPI == NULL)
1999 goto ucnhashError;
2000 }
2001 if (*s == '{') {
2002 const char *start = s+1;
2003 /* look for the closing brace */
2004 while (*s != '}' && s < end)
2005 s++;
2006 if (s > start && s < end && *s == '}') {
2007 /* found a name. look it up in the unicode database */
2008 message = "unknown Unicode character name";
2009 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002010 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002011 goto store;
2012 }
2013 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 endinpos = s-starts;
2015 outpos = p-PyUnicode_AS_UNICODE(v);
2016 if (unicode_decode_call_errorhandler(
2017 errors, &errorHandler,
2018 "unicodeescape", message,
2019 starts, size, &startinpos, &endinpos, &exc, &s,
2020 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002021 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002022 break;
2023
2024 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002025 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 message = "\\ at end of string";
2027 s--;
2028 endinpos = s-starts;
2029 outpos = p-PyUnicode_AS_UNICODE(v);
2030 if (unicode_decode_call_errorhandler(
2031 errors, &errorHandler,
2032 "unicodeescape", message,
2033 starts, size, &startinpos, &endinpos, &exc, &s,
2034 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002035 goto onError;
2036 }
2037 else {
2038 *p++ = '\\';
2039 *p++ = (unsigned char)s[-1];
2040 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002041 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 nextByte:
2044 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002046 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002048 Py_XDECREF(errorHandler);
2049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002051
Fredrik Lundhccc74732001-02-18 22:13:49 +00002052ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002053 PyErr_SetString(
2054 PyExc_UnicodeError,
2055 "\\N escapes not supported (can't load unicodedata module)"
2056 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002057 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 Py_XDECREF(errorHandler);
2059 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002060 return NULL;
2061
Fredrik Lundhccc74732001-02-18 22:13:49 +00002062onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 Py_XDECREF(errorHandler);
2065 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 return NULL;
2067}
2068
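/* Return a Unicode-Escape string version of the Unicode object
   (implemented by unicodeescape_string() below).

   If quotes is true, the string is enclosed in single or double quotes
   as appropriate: double quotes are used only when the text contains a
   single quote and no double quote.

*/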
2075
Thomas Wouters477c8d52006-05-27 19:21:47 +00002076Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2077 Py_ssize_t size,
2078 Py_UNICODE ch)
2079{
2080 /* like wcschr, but doesn't stop at NULL characters */
2081
2082 while (size-- > 0) {
2083 if (*s == ch)
2084 return s;
2085 s++;
2086 }
2087
2088 return NULL;
2089}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002090
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091static
2092PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002093 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 int quotes)
2095{
2096 PyObject *repr;
2097 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002099 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100
Thomas Wouters89f507f2006-12-13 04:49:30 +00002101 /* XXX(nnorwitz): rather than over-allocating, it would be
2102 better to choose a different scheme. Perhaps scan the
2103 first N-chars of the string and allocate based on that size.
2104 */
2105 /* Initial allocation is based on the longest-possible unichr
2106 escape.
2107
2108 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2109 unichr, so in this case it's the longest unichr escape. In
2110 narrow (UTF-16) builds this is five chars per source unichr
2111 since there are two unichrs in the surrogate pair, so in narrow
2112 (UTF-16) builds it's not the longest unichr escape.
2113
2114 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2115 so in the narrow (UTF-16) build case it's the longest unichr
2116 escape.
2117 */
2118
2119 repr = PyString_FromStringAndSize(NULL,
2120 2
2121#ifdef Py_UNICODE_WIDE
2122 + 10*size
2123#else
2124 + 6*size
2125#endif
2126 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 if (repr == NULL)
2128 return NULL;
2129
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002130 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131
2132 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002133 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 !findchar(s, size, '"')) ? '"' : '\'';
2135 }
2136 while (size-- > 0) {
2137 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002138
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002139 /* Escape quotes and backslashes */
2140 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002141 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
2143 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002144 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002145 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002146
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002147#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002148 /* Map 21-bit characters to '\U00xxxxxx' */
2149 else if (ch >= 0x10000) {
2150 *p++ = '\\';
2151 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002152 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2153 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2154 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2155 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2156 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2157 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2158 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002159 *p++ = hexdigit[ch & 0x0000000F];
2160 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002161 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002162#else
2163 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002164 else if (ch >= 0xD800 && ch < 0xDC00) {
2165 Py_UNICODE ch2;
2166 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002167
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002168 ch2 = *s++;
2169 size--;
2170 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2171 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2172 *p++ = '\\';
2173 *p++ = 'U';
2174 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2175 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2176 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2177 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2178 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2179 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2180 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2181 *p++ = hexdigit[ucs & 0x0000000F];
2182 continue;
2183 }
2184 /* Fall through: isolated surrogates are copied as-is */
2185 s--;
2186 size++;
2187 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002188#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002189
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002191 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 *p++ = '\\';
2193 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002194 *p++ = hexdigit[(ch >> 12) & 0x000F];
2195 *p++ = hexdigit[(ch >> 8) & 0x000F];
2196 *p++ = hexdigit[(ch >> 4) & 0x000F];
2197 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002199
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002200 /* Map special whitespace to '\t', \n', '\r' */
2201 else if (ch == '\t') {
2202 *p++ = '\\';
2203 *p++ = 't';
2204 }
2205 else if (ch == '\n') {
2206 *p++ = '\\';
2207 *p++ = 'n';
2208 }
2209 else if (ch == '\r') {
2210 *p++ = '\\';
2211 *p++ = 'r';
2212 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002213
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002214 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002215 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002217 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002218 *p++ = hexdigit[(ch >> 4) & 0x000F];
2219 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002220 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 /* Copy everything else as-is */
2223 else
2224 *p++ = (char) ch;
2225 }
2226 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002227 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
2229 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002230 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 return repr;
2232}
2233
2234PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002235 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236{
2237 return unicodeescape_string(s, size, 0);
2238}
2239
2240PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2241{
2242 if (!PyUnicode_Check(unicode)) {
2243 PyErr_BadArgument();
2244 return NULL;
2245 }
2246 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2247 PyUnicode_GET_SIZE(unicode));
2248}
2249
2250/* --- Raw Unicode Escape Codec ------------------------------------------- */
2251
2252PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002253 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 const char *errors)
2255{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002257 Py_ssize_t startinpos;
2258 Py_ssize_t endinpos;
2259 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002261 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 const char *end;
2263 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 PyObject *errorHandler = NULL;
2265 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002266
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 /* Escaped strings will always be longer than the resulting
2268 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269 length after conversion to the true value. (But decoding error
2270 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 v = _PyUnicode_New(size);
2272 if (v == NULL)
2273 goto onError;
2274 if (size == 0)
2275 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 end = s + size;
2278 while (s < end) {
2279 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002280 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002282 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283
2284 /* Non-escape characters are interpreted as Unicode ordinals */
2285 if (*s != '\\') {
2286 *p++ = (unsigned char)*s++;
2287 continue;
2288 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290
2291 /* \u-escapes are only interpreted iff the number of leading
2292 backslashes if odd */
2293 bs = s;
2294 for (;s < end;) {
2295 if (*s != '\\')
2296 break;
2297 *p++ = (unsigned char)*s++;
2298 }
2299 if (((s - bs) & 1) == 0 ||
2300 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002301 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 continue;
2303 }
2304 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 s++;
2307
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002308 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002309 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002310 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002311 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002313 endinpos = s-starts;
2314 if (unicode_decode_call_errorhandler(
2315 errors, &errorHandler,
2316 "rawunicodeescape", "truncated \\uXXXX",
2317 starts, size, &startinpos, &endinpos, &exc, &s,
2318 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002320 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 }
2322 x = (x<<4) & ~0xF;
2323 if (c >= '0' && c <= '9')
2324 x += c - '0';
2325 else if (c >= 'a' && c <= 'f')
2326 x += 10 + c - 'a';
2327 else
2328 x += 10 + c - 'A';
2329 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002330#ifndef Py_UNICODE_WIDE
2331 if (x > 0x10000) {
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p))
2337 goto onError;
2338 }
2339#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002340 *p++ = x;
2341 nextByte:
2342 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002344 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 Py_XDECREF(errorHandler);
2347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002349
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 onError:
2351 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return NULL;
2355}
2356
2357PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002358 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359{
2360 PyObject *repr;
2361 char *p;
2362 char *q;
2363
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002364 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002366#ifdef Py_UNICODE_WIDE
2367 repr = PyString_FromStringAndSize(NULL, 10 * size);
2368#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002370#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 if (repr == NULL)
2372 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002373 if (size == 0)
2374 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375
2376 p = q = PyString_AS_STRING(repr);
2377 while (size-- > 0) {
2378 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002379#ifdef Py_UNICODE_WIDE
2380 /* Map 32-bit characters to '\Uxxxxxxxx' */
2381 if (ch >= 0x10000) {
2382 *p++ = '\\';
2383 *p++ = 'U';
2384 *p++ = hexdigit[(ch >> 28) & 0xf];
2385 *p++ = hexdigit[(ch >> 24) & 0xf];
2386 *p++ = hexdigit[(ch >> 20) & 0xf];
2387 *p++ = hexdigit[(ch >> 16) & 0xf];
2388 *p++ = hexdigit[(ch >> 12) & 0xf];
2389 *p++ = hexdigit[(ch >> 8) & 0xf];
2390 *p++ = hexdigit[(ch >> 4) & 0xf];
2391 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002392 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002393 else
2394#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395 /* Map 16-bit characters to '\uxxxx' */
2396 if (ch >= 256) {
2397 *p++ = '\\';
2398 *p++ = 'u';
2399 *p++ = hexdigit[(ch >> 12) & 0xf];
2400 *p++ = hexdigit[(ch >> 8) & 0xf];
2401 *p++ = hexdigit[(ch >> 4) & 0xf];
2402 *p++ = hexdigit[ch & 15];
2403 }
2404 /* Copy everything else as-is */
2405 else
2406 *p++ = (char) ch;
2407 }
2408 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002409 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 return repr;
2411}
2412
2413PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2414{
2415 if (!PyUnicode_Check(unicode)) {
2416 PyErr_BadArgument();
2417 return NULL;
2418 }
2419 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2420 PyUnicode_GET_SIZE(unicode));
2421}
2422
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002423/* --- Unicode Internal Codec ------------------------------------------- */
2424
2425PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002426 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002427 const char *errors)
2428{
2429 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002430 Py_ssize_t startinpos;
2431 Py_ssize_t endinpos;
2432 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002433 PyUnicodeObject *v;
2434 Py_UNICODE *p;
2435 const char *end;
2436 const char *reason;
2437 PyObject *errorHandler = NULL;
2438 PyObject *exc = NULL;
2439
Neal Norwitzd43069c2006-01-08 01:12:10 +00002440#ifdef Py_UNICODE_WIDE
2441 Py_UNICODE unimax = PyUnicode_GetMax();
2442#endif
2443
Thomas Wouters89f507f2006-12-13 04:49:30 +00002444 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002445 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2446 if (v == NULL)
2447 goto onError;
2448 if (PyUnicode_GetSize((PyObject *)v) == 0)
2449 return (PyObject *)v;
2450 p = PyUnicode_AS_UNICODE(v);
2451 end = s + size;
2452
2453 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002454 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002455 /* We have to sanity check the raw data, otherwise doom looms for
2456 some malformed UCS-4 data. */
2457 if (
2458 #ifdef Py_UNICODE_WIDE
2459 *p > unimax || *p < 0 ||
2460 #endif
2461 end-s < Py_UNICODE_SIZE
2462 )
2463 {
2464 startinpos = s - starts;
2465 if (end-s < Py_UNICODE_SIZE) {
2466 endinpos = end-starts;
2467 reason = "truncated input";
2468 }
2469 else {
2470 endinpos = s - starts + Py_UNICODE_SIZE;
2471 reason = "illegal code point (> 0x10FFFF)";
2472 }
2473 outpos = p - PyUnicode_AS_UNICODE(v);
2474 if (unicode_decode_call_errorhandler(
2475 errors, &errorHandler,
2476 "unicode_internal", reason,
2477 starts, size, &startinpos, &endinpos, &exc, &s,
2478 (PyObject **)&v, &outpos, &p)) {
2479 goto onError;
2480 }
2481 }
2482 else {
2483 p++;
2484 s += Py_UNICODE_SIZE;
2485 }
2486 }
2487
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002488 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002489 goto onError;
2490 Py_XDECREF(errorHandler);
2491 Py_XDECREF(exc);
2492 return (PyObject *)v;
2493
2494 onError:
2495 Py_XDECREF(v);
2496 Py_XDECREF(errorHandler);
2497 Py_XDECREF(exc);
2498 return NULL;
2499}
2500
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501/* --- Latin-1 Codec ------------------------------------------------------ */
2502
2503PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002504 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 const char *errors)
2506{
2507 PyUnicodeObject *v;
2508 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002509
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002511 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002512 Py_UNICODE r = *(unsigned char*)s;
2513 return PyUnicode_FromUnicode(&r, 1);
2514 }
2515
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 v = _PyUnicode_New(size);
2517 if (v == NULL)
2518 goto onError;
2519 if (size == 0)
2520 return (PyObject *)v;
2521 p = PyUnicode_AS_UNICODE(v);
2522 while (size-- > 0)
2523 *p++ = (unsigned char)*s++;
2524 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002525
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 onError:
2527 Py_XDECREF(v);
2528 return NULL;
2529}
2530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531/* create or adjust a UnicodeEncodeError */
2532static void make_encode_exception(PyObject **exceptionObject,
2533 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002534 const Py_UNICODE *unicode, Py_ssize_t size,
2535 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 if (*exceptionObject == NULL) {
2539 *exceptionObject = PyUnicodeEncodeError_Create(
2540 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 }
2542 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2544 goto onError;
2545 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2546 goto onError;
2547 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2548 goto onError;
2549 return;
2550 onError:
2551 Py_DECREF(*exceptionObject);
2552 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 }
2554}
2555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556/* raises a UnicodeEncodeError */
2557static void raise_encode_exception(PyObject **exceptionObject,
2558 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002559 const Py_UNICODE *unicode, Py_ssize_t size,
2560 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 const char *reason)
2562{
2563 make_encode_exception(exceptionObject,
2564 encoding, unicode, size, startpos, endpos, reason);
2565 if (*exceptionObject != NULL)
2566 PyCodec_StrictErrors(*exceptionObject);
2567}
2568
2569/* error handling callback helper:
2570 build arguments, call the callback and check the arguments,
2571 put the result into newpos and return the replacement string, which
2572 has to be freed by the caller */
2573static PyObject *unicode_encode_call_errorhandler(const char *errors,
2574 PyObject **errorHandler,
2575 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002576 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2577 Py_ssize_t startpos, Py_ssize_t endpos,
2578 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581
2582 PyObject *restuple;
2583 PyObject *resunicode;
2584
2585 if (*errorHandler == NULL) {
2586 *errorHandler = PyCodec_LookupError(errors);
2587 if (*errorHandler == NULL)
2588 return NULL;
2589 }
2590
2591 make_encode_exception(exceptionObject,
2592 encoding, unicode, size, startpos, endpos, reason);
2593 if (*exceptionObject == NULL)
2594 return NULL;
2595
2596 restuple = PyObject_CallFunctionObjArgs(
2597 *errorHandler, *exceptionObject, NULL);
2598 if (restuple == NULL)
2599 return NULL;
2600 if (!PyTuple_Check(restuple)) {
2601 PyErr_Format(PyExc_TypeError, &argparse[4]);
2602 Py_DECREF(restuple);
2603 return NULL;
2604 }
2605 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2606 &resunicode, newpos)) {
2607 Py_DECREF(restuple);
2608 return NULL;
2609 }
2610 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002611 *newpos = size+*newpos;
2612 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002613 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002614 Py_DECREF(restuple);
2615 return NULL;
2616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 Py_INCREF(resunicode);
2618 Py_DECREF(restuple);
2619 return resunicode;
2620}
2621
2622static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002623 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 const char *errors,
2625 int limit)
2626{
2627 /* output object */
2628 PyObject *res;
2629 /* pointers to the beginning and end+1 of input */
2630 const Py_UNICODE *startp = p;
2631 const Py_UNICODE *endp = p + size;
2632 /* pointer to the beginning of the unencodable characters */
2633 /* const Py_UNICODE *badp = NULL; */
2634 /* pointer into the output */
2635 char *str;
2636 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002637 Py_ssize_t respos = 0;
2638 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002639 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2640 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002641 PyObject *errorHandler = NULL;
2642 PyObject *exc = NULL;
2643 /* the following variable is used for caching string comparisons
2644 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2645 int known_errorHandler = -1;
2646
2647 /* allocate enough for a simple encoding without
2648 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002649 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002650 if (res == NULL)
2651 goto onError;
2652 if (size == 0)
2653 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002654 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 ressize = size;
2656
2657 while (p<endp) {
2658 Py_UNICODE c = *p;
2659
2660 /* can we encode this? */
2661 if (c<limit) {
2662 /* no overflow check, because we know that the space is enough */
2663 *str++ = (char)c;
2664 ++p;
2665 }
2666 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002667 Py_ssize_t unicodepos = p-startp;
2668 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002670 Py_ssize_t repsize;
2671 Py_ssize_t newpos;
2672 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 Py_UNICODE *uni2;
2674 /* startpos for collecting unencodable chars */
2675 const Py_UNICODE *collstart = p;
2676 const Py_UNICODE *collend = p;
2677 /* find all unecodable characters */
2678 while ((collend < endp) && ((*collend)>=limit))
2679 ++collend;
2680 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2681 if (known_errorHandler==-1) {
2682 if ((errors==NULL) || (!strcmp(errors, "strict")))
2683 known_errorHandler = 1;
2684 else if (!strcmp(errors, "replace"))
2685 known_errorHandler = 2;
2686 else if (!strcmp(errors, "ignore"))
2687 known_errorHandler = 3;
2688 else if (!strcmp(errors, "xmlcharrefreplace"))
2689 known_errorHandler = 4;
2690 else
2691 known_errorHandler = 0;
2692 }
2693 switch (known_errorHandler) {
2694 case 1: /* strict */
2695 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2696 goto onError;
2697 case 2: /* replace */
2698 while (collstart++<collend)
2699 *str++ = '?'; /* fall through */
2700 case 3: /* ignore */
2701 p = collend;
2702 break;
2703 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002704 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 /* determine replacement size (temporarily (mis)uses p) */
2706 for (p = collstart, repsize = 0; p < collend; ++p) {
2707 if (*p<10)
2708 repsize += 2+1+1;
2709 else if (*p<100)
2710 repsize += 2+2+1;
2711 else if (*p<1000)
2712 repsize += 2+3+1;
2713 else if (*p<10000)
2714 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002715#ifndef Py_UNICODE_WIDE
2716 else
2717 repsize += 2+5+1;
2718#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 else if (*p<100000)
2720 repsize += 2+5+1;
2721 else if (*p<1000000)
2722 repsize += 2+6+1;
2723 else
2724 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002725#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 }
2727 requiredsize = respos+repsize+(endp-collend);
2728 if (requiredsize > ressize) {
2729 if (requiredsize<2*ressize)
2730 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002731 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002733 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 ressize = requiredsize;
2735 }
2736 /* generate replacement (temporarily (mis)uses p) */
2737 for (p = collstart; p < collend; ++p) {
2738 str += sprintf(str, "&#%d;", (int)*p);
2739 }
2740 p = collend;
2741 break;
2742 default:
2743 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2744 encoding, reason, startp, size, &exc,
2745 collstart-startp, collend-startp, &newpos);
2746 if (repunicode == NULL)
2747 goto onError;
2748 /* need more space? (at least enough for what we
2749 have+the replacement+the rest of the string, so
2750 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002751 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 repsize = PyUnicode_GET_SIZE(repunicode);
2753 requiredsize = respos+repsize+(endp-collend);
2754 if (requiredsize > ressize) {
2755 if (requiredsize<2*ressize)
2756 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002757 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 Py_DECREF(repunicode);
2759 goto onError;
2760 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002761 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 ressize = requiredsize;
2763 }
2764 /* check if there is anything unencodable in the replacement
2765 and copy it to the output */
2766 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2767 c = *uni2;
2768 if (c >= limit) {
2769 raise_encode_exception(&exc, encoding, startp, size,
2770 unicodepos, unicodepos+1, reason);
2771 Py_DECREF(repunicode);
2772 goto onError;
2773 }
2774 *str = (char)c;
2775 }
2776 p = startp + newpos;
2777 Py_DECREF(repunicode);
2778 }
2779 }
2780 }
2781 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002782 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 if (respos<ressize)
2784 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002785 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 Py_XDECREF(errorHandler);
2787 Py_XDECREF(exc);
2788 return res;
2789
2790 onError:
2791 Py_XDECREF(res);
2792 Py_XDECREF(errorHandler);
2793 Py_XDECREF(exc);
2794 return NULL;
2795}
2796
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002798 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 const char *errors)
2800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802}
2803
2804PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2805{
2806 if (!PyUnicode_Check(unicode)) {
2807 PyErr_BadArgument();
2808 return NULL;
2809 }
2810 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2811 PyUnicode_GET_SIZE(unicode),
2812 NULL);
2813}
2814
2815/* --- 7-bit ASCII Codec -------------------------------------------------- */
2816
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002818 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 const char *errors)
2820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 PyUnicodeObject *v;
2823 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t startinpos;
2825 Py_ssize_t endinpos;
2826 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 const char *e;
2828 PyObject *errorHandler = NULL;
2829 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002832 if (size == 1 && *(unsigned char*)s < 128) {
2833 Py_UNICODE r = *(unsigned char*)s;
2834 return PyUnicode_FromUnicode(&r, 1);
2835 }
Tim Petersced69f82003-09-16 20:30:58 +00002836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 v = _PyUnicode_New(size);
2838 if (v == NULL)
2839 goto onError;
2840 if (size == 0)
2841 return (PyObject *)v;
2842 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 e = s + size;
2844 while (s < e) {
2845 register unsigned char c = (unsigned char)*s;
2846 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 ++s;
2849 }
2850 else {
2851 startinpos = s-starts;
2852 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002853 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 if (unicode_decode_call_errorhandler(
2855 errors, &errorHandler,
2856 "ascii", "ordinal not in range(128)",
2857 starts, size, &startinpos, &endinpos, &exc, &s,
2858 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002862 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002863 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002864 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 Py_XDECREF(errorHandler);
2866 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002868
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 onError:
2870 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 Py_XDECREF(errorHandler);
2872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 return NULL;
2874}
2875
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002877 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 const char *errors)
2879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881}
2882
2883PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2884{
2885 if (!PyUnicode_Check(unicode)) {
2886 PyErr_BadArgument();
2887 return NULL;
2888 }
2889 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2890 PyUnicode_GET_SIZE(unicode),
2891 NULL);
2892}
2893
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002894#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002895
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002896/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002897
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002898#if SIZEOF_INT < SIZEOF_SSIZE_T
2899#define NEED_RETRY
2900#endif
2901
2902/* XXX This code is limited to "true" double-byte encodings, as
2903 a) it assumes an incomplete character consists of a single byte, and
2904 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2905 encodings, see IsDBCSLeadByteEx documentation. */
2906
/* Return 1 if the byte at S[OFFSET] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the character CharPrev()
   finds does not start with a lead byte, or when that previous character
   is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
2917
2918/*
2919 * Decode MBCS string into unicode object. If 'final' is set, converts
2920 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2921 */
2922static int decode_mbcs(PyUnicodeObject **v,
2923 const char *s, /* MBCS string */
2924 int size, /* sizeof MBCS string */
2925 int final)
2926{
2927 Py_UNICODE *p;
2928 Py_ssize_t n = 0;
2929 int usize = 0;
2930
2931 assert(size >= 0);
2932
2933 /* Skip trailing lead-byte unless 'final' is set */
2934 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2935 --size;
2936
2937 /* First get the size of the result */
2938 if (size > 0) {
2939 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2940 if (usize == 0) {
2941 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2942 return -1;
2943 }
2944 }
2945
2946 if (*v == NULL) {
2947 /* Create unicode object */
2948 *v = _PyUnicode_New(usize);
2949 if (*v == NULL)
2950 return -1;
2951 }
2952 else {
2953 /* Extend unicode object */
2954 n = PyUnicode_GET_SIZE(*v);
2955 if (_PyUnicode_Resize(v, n + usize) < 0)
2956 return -1;
2957 }
2958
2959 /* Do the conversion */
2960 if (size > 0) {
2961 p = PyUnicode_AS_UNICODE(*v) + n;
2962 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2963 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2964 return -1;
2965 }
2966 }
2967
2968 return size;
2969}
2970
2971PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2972 Py_ssize_t size,
2973 const char *errors,
2974 Py_ssize_t *consumed)
2975{
2976 PyUnicodeObject *v = NULL;
2977 int done;
2978
2979 if (consumed)
2980 *consumed = 0;
2981
2982#ifdef NEED_RETRY
2983 retry:
2984 if (size > INT_MAX)
2985 done = decode_mbcs(&v, s, INT_MAX, 0);
2986 else
2987#endif
2988 done = decode_mbcs(&v, s, (int)size, !consumed);
2989
2990 if (done < 0) {
2991 Py_XDECREF(v);
2992 return NULL;
2993 }
2994
2995 if (consumed)
2996 *consumed += done;
2997
2998#ifdef NEED_RETRY
2999 if (size > INT_MAX) {
3000 s += done;
3001 size -= done;
3002 goto retry;
3003 }
3004#endif
3005
3006 return (PyObject *)v;
3007}
3008
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003009PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003011 const char *errors)
3012{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003013 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3014}
3015
3016/*
3017 * Convert unicode into string object (MBCS).
3018 * Returns 0 if succeed, -1 otherwise.
3019 */
3020static int encode_mbcs(PyObject **repr,
3021 const Py_UNICODE *p, /* unicode */
3022 int size) /* size of unicode */
3023{
3024 int mbcssize = 0;
3025 Py_ssize_t n = 0;
3026
3027 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003028
3029 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003030 if (size > 0) {
3031 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3032 if (mbcssize == 0) {
3033 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3034 return -1;
3035 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003036 }
3037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003038 if (*repr == NULL) {
3039 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003040 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003041 if (*repr == NULL)
3042 return -1;
3043 }
3044 else {
3045 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003046 n = PyBytes_Size(*repr);
3047 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003048 return -1;
3049 }
3050
3051 /* Do the conversion */
3052 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003053 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003054 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3055 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3056 return -1;
3057 }
3058 }
3059
3060 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003061}
3062
3063PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003064 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003065 const char *errors)
3066{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003067 PyObject *repr = NULL;
3068 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003069
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003070#ifdef NEED_RETRY
3071 retry:
3072 if (size > INT_MAX)
3073 ret = encode_mbcs(&repr, p, INT_MAX);
3074 else
3075#endif
3076 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003077
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003078 if (ret < 0) {
3079 Py_XDECREF(repr);
3080 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003081 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003082
3083#ifdef NEED_RETRY
3084 if (size > INT_MAX) {
3085 p += INT_MAX;
3086 size -= INT_MAX;
3087 goto retry;
3088 }
3089#endif
3090
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003091 return repr;
3092}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003093
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003094PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3095{
3096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_BadArgument();
3098 return NULL;
3099 }
3100 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3101 PyUnicode_GET_SIZE(unicode),
3102 NULL);
3103}
3104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003105#undef NEED_RETRY
3106
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003107#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003108
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109/* --- Character Mapping Codec -------------------------------------------- */
3110
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003112 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 PyObject *mapping,
3114 const char *errors)
3115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t startinpos;
3118 Py_ssize_t endinpos;
3119 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 PyUnicodeObject *v;
3122 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003124 PyObject *errorHandler = NULL;
3125 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003126 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003127 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003128
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 /* Default to Latin-1 */
3130 if (mapping == NULL)
3131 return PyUnicode_DecodeLatin1(s, size, errors);
3132
3133 v = _PyUnicode_New(size);
3134 if (v == NULL)
3135 goto onError;
3136 if (size == 0)
3137 return (PyObject *)v;
3138 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003140 if (PyUnicode_CheckExact(mapping)) {
3141 mapstring = PyUnicode_AS_UNICODE(mapping);
3142 maplen = PyUnicode_GET_SIZE(mapping);
3143 while (s < e) {
3144 unsigned char ch = *s;
3145 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003147 if (ch < maplen)
3148 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003150 if (x == 0xfffe) {
3151 /* undefined mapping */
3152 outpos = p-PyUnicode_AS_UNICODE(v);
3153 startinpos = s-starts;
3154 endinpos = startinpos+1;
3155 if (unicode_decode_call_errorhandler(
3156 errors, &errorHandler,
3157 "charmap", "character maps to <undefined>",
3158 starts, size, &startinpos, &endinpos, &exc, &s,
3159 (PyObject **)&v, &outpos, &p)) {
3160 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003161 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003162 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003163 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003164 *p++ = x;
3165 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003167 }
3168 else {
3169 while (s < e) {
3170 unsigned char ch = *s;
3171 PyObject *w, *x;
3172
3173 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3174 w = PyInt_FromLong((long)ch);
3175 if (w == NULL)
3176 goto onError;
3177 x = PyObject_GetItem(mapping, w);
3178 Py_DECREF(w);
3179 if (x == NULL) {
3180 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3181 /* No mapping found means: mapping is undefined. */
3182 PyErr_Clear();
3183 x = Py_None;
3184 Py_INCREF(x);
3185 } else
3186 goto onError;
3187 }
3188
3189 /* Apply mapping */
3190 if (PyInt_Check(x)) {
3191 long value = PyInt_AS_LONG(x);
3192 if (value < 0 || value > 65535) {
3193 PyErr_SetString(PyExc_TypeError,
3194 "character mapping must be in range(65536)");
3195 Py_DECREF(x);
3196 goto onError;
3197 }
3198 *p++ = (Py_UNICODE)value;
3199 }
3200 else if (x == Py_None) {
3201 /* undefined mapping */
3202 outpos = p-PyUnicode_AS_UNICODE(v);
3203 startinpos = s-starts;
3204 endinpos = startinpos+1;
3205 if (unicode_decode_call_errorhandler(
3206 errors, &errorHandler,
3207 "charmap", "character maps to <undefined>",
3208 starts, size, &startinpos, &endinpos, &exc, &s,
3209 (PyObject **)&v, &outpos, &p)) {
3210 Py_DECREF(x);
3211 goto onError;
3212 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003213 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003214 continue;
3215 }
3216 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003217 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003218
3219 if (targetsize == 1)
3220 /* 1-1 mapping */
3221 *p++ = *PyUnicode_AS_UNICODE(x);
3222
3223 else if (targetsize > 1) {
3224 /* 1-n mapping */
3225 if (targetsize > extrachars) {
3226 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003227 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3228 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003229 (targetsize << 2);
3230 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003231 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003232 if (_PyUnicode_Resize(&v,
3233 PyUnicode_GET_SIZE(v) + needed) < 0) {
3234 Py_DECREF(x);
3235 goto onError;
3236 }
3237 p = PyUnicode_AS_UNICODE(v) + oldpos;
3238 }
3239 Py_UNICODE_COPY(p,
3240 PyUnicode_AS_UNICODE(x),
3241 targetsize);
3242 p += targetsize;
3243 extrachars -= targetsize;
3244 }
3245 /* 1-0 mapping: skip the character */
3246 }
3247 else {
3248 /* wrong return value */
3249 PyErr_SetString(PyExc_TypeError,
3250 "character mapping must return integer, None or unicode");
3251 Py_DECREF(x);
3252 goto onError;
3253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003255 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 }
3258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003259 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 Py_XDECREF(errorHandler);
3262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003264
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 Py_XDECREF(v);
3269 return NULL;
3270}
3271
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003272/* Charmap encoding: the lookup table */
3273
3274struct encoding_map{
3275 PyObject_HEAD
3276 unsigned char level1[32];
3277 int count2, count3;
3278 unsigned char level23[1];
3279};
3280
3281static PyObject*
3282encoding_map_size(PyObject *obj, PyObject* args)
3283{
3284 struct encoding_map *map = (struct encoding_map*)obj;
3285 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3286 128*map->count3);
3287}
3288
3289static PyMethodDef encoding_map_methods[] = {
3290 {"size", encoding_map_size, METH_NOARGS,
3291 PyDoc_STR("Return the size (in bytes) of this object") },
3292 { 0 }
3293};
3294
3295static void
3296encoding_map_dealloc(PyObject* o)
3297{
3298 PyObject_FREE(o);
3299}
3300
/* Type object for struct encoding_map instances.

   Only four slots carry information: the name, the basic size, the
   deallocator and the method table (plus the default type flags).  Every
   other slot is left 0.  The initializer is positional, so the order of the
   entries below must match the PyTypeObject layout exactly.

   NOTE(review): PyObject_HEAD_INIT(NULL) leaves the type pointer unset here;
   presumably it is filled in during initialization elsewhere -- confirm. */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
3345
3346PyObject*
3347PyUnicode_BuildEncodingMap(PyObject* string)
3348{
3349 Py_UNICODE *decode;
3350 PyObject *result;
3351 struct encoding_map *mresult;
3352 int i;
3353 int need_dict = 0;
3354 unsigned char level1[32];
3355 unsigned char level2[512];
3356 unsigned char *mlevel1, *mlevel2, *mlevel3;
3357 int count2 = 0, count3 = 0;
3358
3359 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3360 PyErr_BadArgument();
3361 return NULL;
3362 }
3363 decode = PyUnicode_AS_UNICODE(string);
3364 memset(level1, 0xFF, sizeof level1);
3365 memset(level2, 0xFF, sizeof level2);
3366
3367 /* If there isn't a one-to-one mapping of NULL to \0,
3368 or if there are non-BMP characters, we need to use
3369 a mapping dictionary. */
3370 if (decode[0] != 0)
3371 need_dict = 1;
3372 for (i = 1; i < 256; i++) {
3373 int l1, l2;
3374 if (decode[i] == 0
3375 #ifdef Py_UNICODE_WIDE
3376 || decode[i] > 0xFFFF
3377 #endif
3378 ) {
3379 need_dict = 1;
3380 break;
3381 }
3382 if (decode[i] == 0xFFFE)
3383 /* unmapped character */
3384 continue;
3385 l1 = decode[i] >> 11;
3386 l2 = decode[i] >> 7;
3387 if (level1[l1] == 0xFF)
3388 level1[l1] = count2++;
3389 if (level2[l2] == 0xFF)
3390 level2[l2] = count3++;
3391 }
3392
3393 if (count2 >= 0xFF || count3 >= 0xFF)
3394 need_dict = 1;
3395
3396 if (need_dict) {
3397 PyObject *result = PyDict_New();
3398 PyObject *key, *value;
3399 if (!result)
3400 return NULL;
3401 for (i = 0; i < 256; i++) {
3402 key = value = NULL;
3403 key = PyInt_FromLong(decode[i]);
3404 value = PyInt_FromLong(i);
3405 if (!key || !value)
3406 goto failed1;
3407 if (PyDict_SetItem(result, key, value) == -1)
3408 goto failed1;
3409 Py_DECREF(key);
3410 Py_DECREF(value);
3411 }
3412 return result;
3413 failed1:
3414 Py_XDECREF(key);
3415 Py_XDECREF(value);
3416 Py_DECREF(result);
3417 return NULL;
3418 }
3419
3420 /* Create a three-level trie */
3421 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3422 16*count2 + 128*count3 - 1);
3423 if (!result)
3424 return PyErr_NoMemory();
3425 PyObject_Init(result, &EncodingMapType);
3426 mresult = (struct encoding_map*)result;
3427 mresult->count2 = count2;
3428 mresult->count3 = count3;
3429 mlevel1 = mresult->level1;
3430 mlevel2 = mresult->level23;
3431 mlevel3 = mresult->level23 + 16*count2;
3432 memcpy(mlevel1, level1, 32);
3433 memset(mlevel2, 0xFF, 16*count2);
3434 memset(mlevel3, 0, 128*count3);
3435 count3 = 0;
3436 for (i = 1; i < 256; i++) {
3437 int o1, o2, o3, i2, i3;
3438 if (decode[i] == 0xFFFE)
3439 /* unmapped character */
3440 continue;
3441 o1 = decode[i]>>11;
3442 o2 = (decode[i]>>7) & 0xF;
3443 i2 = 16*mlevel1[o1] + o2;
3444 if (mlevel2[i2] == 0xFF)
3445 mlevel2[i2] = count3++;
3446 o3 = decode[i] & 0x7F;
3447 i3 = 128*mlevel2[i2] + o3;
3448 mlevel3[i3] = i;
3449 }
3450 return result;
3451}
3452
3453static int
3454encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3455{
3456 struct encoding_map *map = (struct encoding_map*)mapping;
3457 int l1 = c>>11;
3458 int l2 = (c>>7) & 0xF;
3459 int l3 = c & 0x7F;
3460 int i;
3461
3462#ifdef Py_UNICODE_WIDE
3463 if (c > 0xFFFF) {
3464 return -1;
3465 }
3466#endif
3467 if (c == 0)
3468 return 0;
3469 /* level 1*/
3470 i = map->level1[l1];
3471 if (i == 0xFF) {
3472 return -1;
3473 }
3474 /* level 2*/
3475 i = map->level23[16*i+l2];
3476 if (i == 0xFF) {
3477 return -1;
3478 }
3479 /* level 3 */
3480 i = map->level23[16*map->count2 + 128*i + l3];
3481 if (i == 0) {
3482 return -1;
3483 }
3484 return i;
3485}
3486
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487/* Lookup the character ch in the mapping. If the character
3488 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003489 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 PyObject *w = PyInt_FromLong((long)c);
3493 PyObject *x;
3494
3495 if (w == NULL)
3496 return NULL;
3497 x = PyObject_GetItem(mapping, w);
3498 Py_DECREF(w);
3499 if (x == NULL) {
3500 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3501 /* No mapping found means: mapping is undefined. */
3502 PyErr_Clear();
3503 x = Py_None;
3504 Py_INCREF(x);
3505 return x;
3506 } else
3507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003509 else if (x == Py_None)
3510 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 else if (PyInt_Check(x)) {
3512 long value = PyInt_AS_LONG(x);
3513 if (value < 0 || value > 255) {
3514 PyErr_SetString(PyExc_TypeError,
3515 "character mapping must be in range(256)");
3516 Py_DECREF(x);
3517 return NULL;
3518 }
3519 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 else if (PyString_Check(x))
3522 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 /* wrong return value */
3525 PyErr_SetString(PyExc_TypeError,
3526 "character mapping must return integer, None or str");
3527 Py_DECREF(x);
3528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 }
3530}
3531
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532static int
3533charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3534{
3535 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3536 /* exponentially overallocate to minimize reallocations */
3537 if (requiredsize < 2*outsize)
3538 requiredsize = 2*outsize;
3539 if (_PyString_Resize(outobj, requiredsize)) {
3540 return 0;
3541 }
3542 return 1;
3543}
3544
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for it; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548/* lookup the character, put the result in the output string and adjust
3549 various state variables. Reallocate the output string if not enough
3550 space is available. Return a new reference to the object that
3551 was put in the output buffer, or Py_None, if the mapping was undefined
3552 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003553 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003555charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003558 PyObject *rep;
3559 char *outstart;
3560 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003562 if (mapping->ob_type == &EncodingMapType) {
3563 int res = encoding_map_lookup(c, mapping);
3564 Py_ssize_t requiredsize = *outpos+1;
3565 if (res == -1)
3566 return enc_FAILED;
3567 if (outsize<requiredsize)
3568 if (!charmapencode_resize(outobj, outpos, requiredsize))
3569 return enc_EXCEPTION;
3570 outstart = PyString_AS_STRING(*outobj);
3571 outstart[(*outpos)++] = (char)res;
3572 return enc_SUCCESS;
3573 }
3574
3575 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003577 return enc_EXCEPTION;
3578 else if (rep==Py_None) {
3579 Py_DECREF(rep);
3580 return enc_FAILED;
3581 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003583 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 if (outsize<requiredsize)
3585 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003587 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003589 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3591 }
3592 else {
3593 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003594 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3595 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003596 if (outsize<requiredsize)
3597 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003599 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003601 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 memcpy(outstart + *outpos, repchars, repsize);
3603 *outpos += repsize;
3604 }
3605 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003606 Py_DECREF(rep);
3607 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608}
3609
3610/* handle an error in PyUnicode_EncodeCharmap
3611 Return 0 on success, -1 on error */
3612static
3613int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003616 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003617 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618{
3619 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003620 Py_ssize_t repsize;
3621 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_UNICODE *uni2;
3623 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003624 Py_ssize_t collstartpos = *inpos;
3625 Py_ssize_t collendpos = *inpos+1;
3626 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 char *encoding = "charmap";
3628 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003629 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 /* find all unencodable characters */
3632 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003633 PyObject *rep;
3634 if (mapping->ob_type == &EncodingMapType) {
3635 int res = encoding_map_lookup(p[collendpos], mapping);
3636 if (res != -1)
3637 break;
3638 ++collendpos;
3639 continue;
3640 }
3641
3642 rep = charmapencode_lookup(p[collendpos], mapping);
3643 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003645 else if (rep!=Py_None) {
3646 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 break;
3648 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003649 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 ++collendpos;
3651 }
3652 /* cache callback name lookup
3653 * (if not done yet, i.e. it's the first error) */
3654 if (*known_errorHandler==-1) {
3655 if ((errors==NULL) || (!strcmp(errors, "strict")))
3656 *known_errorHandler = 1;
3657 else if (!strcmp(errors, "replace"))
3658 *known_errorHandler = 2;
3659 else if (!strcmp(errors, "ignore"))
3660 *known_errorHandler = 3;
3661 else if (!strcmp(errors, "xmlcharrefreplace"))
3662 *known_errorHandler = 4;
3663 else
3664 *known_errorHandler = 0;
3665 }
3666 switch (*known_errorHandler) {
3667 case 1: /* strict */
3668 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3669 return -1;
3670 case 2: /* replace */
3671 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3672 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003673 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 return -1;
3675 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003676 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3678 return -1;
3679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 }
3681 /* fall through */
3682 case 3: /* ignore */
3683 *inpos = collendpos;
3684 break;
3685 case 4: /* xmlcharrefreplace */
3686 /* generate replacement (temporarily (mis)uses p) */
3687 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3688 char buffer[2+29+1+1];
3689 char *cp;
3690 sprintf(buffer, "&#%d;", (int)p[collpos]);
3691 for (cp = buffer; *cp; ++cp) {
3692 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003693 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3697 return -1;
3698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 }
3700 }
3701 *inpos = collendpos;
3702 break;
3703 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003704 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 encoding, reason, p, size, exceptionObject,
3706 collstartpos, collendpos, &newpos);
3707 if (repunicode == NULL)
3708 return -1;
3709 /* generate replacement */
3710 repsize = PyUnicode_GET_SIZE(repunicode);
3711 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3712 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003713 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 return -1;
3715 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003716 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3719 return -1;
3720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 }
3722 *inpos = newpos;
3723 Py_DECREF(repunicode);
3724 }
3725 return 0;
3726}
3727
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003729 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 PyObject *mapping,
3731 const char *errors)
3732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 /* output object */
3734 PyObject *res = NULL;
3735 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003736 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 PyObject *errorHandler = NULL;
3740 PyObject *exc = NULL;
3741 /* the following variable is used for caching string comparisons
3742 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3743 * 3=ignore, 4=xmlcharrefreplace */
3744 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745
3746 /* Default to Latin-1 */
3747 if (mapping == NULL)
3748 return PyUnicode_EncodeLatin1(p, size, errors);
3749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 /* allocate enough for a simple encoding without
3751 replacements, if we need more, we'll resize */
3752 res = PyString_FromStringAndSize(NULL, size);
3753 if (res == NULL)
3754 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003755 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 while (inpos<size) {
3759 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003760 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3761 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003763 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 if (charmap_encoding_error(p, size, &inpos, mapping,
3765 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003766 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003767 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003768 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 else
3772 /* done with this character => adjust input position */
3773 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* Resize if we allocated to much */
3777 if (respos<PyString_GET_SIZE(res)) {
3778 if (_PyString_Resize(&res, respos))
3779 goto onError;
3780 }
3781 Py_XDECREF(exc);
3782 Py_XDECREF(errorHandler);
3783 return res;
3784
3785 onError:
3786 Py_XDECREF(res);
3787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return NULL;
3790}
3791
3792PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3793 PyObject *mapping)
3794{
3795 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3796 PyErr_BadArgument();
3797 return NULL;
3798 }
3799 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3800 PyUnicode_GET_SIZE(unicode),
3801 mapping,
3802 NULL);
3803}
3804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805/* create or adjust a UnicodeTranslateError */
3806static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 const Py_UNICODE *unicode, Py_ssize_t size,
3808 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 if (*exceptionObject == NULL) {
3812 *exceptionObject = PyUnicodeTranslateError_Create(
3813 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 }
3815 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3817 goto onError;
3818 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3819 goto onError;
3820 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3821 goto onError;
3822 return;
3823 onError:
3824 Py_DECREF(*exceptionObject);
3825 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
3827}
3828
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829/* raises a UnicodeTranslateError */
3830static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003831 const Py_UNICODE *unicode, Py_ssize_t size,
3832 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 const char *reason)
3834{
3835 make_translate_exception(exceptionObject,
3836 unicode, size, startpos, endpos, reason);
3837 if (*exceptionObject != NULL)
3838 PyCodec_StrictErrors(*exceptionObject);
3839}
3840
3841/* error handling callback helper:
3842 build arguments, call the callback and check the arguments,
3843 put the result into newpos and return the replacement string, which
3844 has to be freed by the caller */
3845static PyObject *unicode_translate_call_errorhandler(const char *errors,
3846 PyObject **errorHandler,
3847 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3849 Py_ssize_t startpos, Py_ssize_t endpos,
3850 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003852 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003854 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 PyObject *restuple;
3856 PyObject *resunicode;
3857
3858 if (*errorHandler == NULL) {
3859 *errorHandler = PyCodec_LookupError(errors);
3860 if (*errorHandler == NULL)
3861 return NULL;
3862 }
3863
3864 make_translate_exception(exceptionObject,
3865 unicode, size, startpos, endpos, reason);
3866 if (*exceptionObject == NULL)
3867 return NULL;
3868
3869 restuple = PyObject_CallFunctionObjArgs(
3870 *errorHandler, *exceptionObject, NULL);
3871 if (restuple == NULL)
3872 return NULL;
3873 if (!PyTuple_Check(restuple)) {
3874 PyErr_Format(PyExc_TypeError, &argparse[4]);
3875 Py_DECREF(restuple);
3876 return NULL;
3877 }
3878 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003879 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 Py_DECREF(restuple);
3881 return NULL;
3882 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003883 if (i_newpos<0)
3884 *newpos = size+i_newpos;
3885 else
3886 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003887 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003888 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003889 Py_DECREF(restuple);
3890 return NULL;
3891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 Py_INCREF(resunicode);
3893 Py_DECREF(restuple);
3894 return resunicode;
3895}
3896
3897/* Lookup the character ch in the mapping and put the result in result,
3898 which must be decrefed by the caller.
3899 Return 0 on success, -1 on error */
3900static
3901int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3902{
3903 PyObject *w = PyInt_FromLong((long)c);
3904 PyObject *x;
3905
3906 if (w == NULL)
3907 return -1;
3908 x = PyObject_GetItem(mapping, w);
3909 Py_DECREF(w);
3910 if (x == NULL) {
3911 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3912 /* No mapping found means: use 1:1 mapping. */
3913 PyErr_Clear();
3914 *result = NULL;
3915 return 0;
3916 } else
3917 return -1;
3918 }
3919 else if (x == Py_None) {
3920 *result = x;
3921 return 0;
3922 }
3923 else if (PyInt_Check(x)) {
3924 long value = PyInt_AS_LONG(x);
3925 long max = PyUnicode_GetMax();
3926 if (value < 0 || value > max) {
3927 PyErr_Format(PyExc_TypeError,
3928 "character mapping must be in range(0x%lx)", max+1);
3929 Py_DECREF(x);
3930 return -1;
3931 }
3932 *result = x;
3933 return 0;
3934 }
3935 else if (PyUnicode_Check(x)) {
3936 *result = x;
3937 return 0;
3938 }
3939 else {
3940 /* wrong return value */
3941 PyErr_SetString(PyExc_TypeError,
3942 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003943 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 return -1;
3945 }
3946}
3947/* ensure that *outobj is at least requiredsize characters long,
3948if not reallocate and adjust various state variables.
3949Return 0 on success, -1 on error */
3950static
Walter Dörwald4894c302003-10-24 14:25:28 +00003951int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003955 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003959 if (requiredsize < 2 * oldsize)
3960 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003961 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 return -1;
3963 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 }
3965 return 0;
3966}
3967/* lookup the character, put the result in the output string and adjust
3968 various state variables. Return a new reference to the object that
3969 was put in the output buffer in *result, or Py_None, if the mapping was
3970 undefined (in which case no character was written).
3971 The called must decref result.
3972 Return 0 on success, -1 on error. */
3973static
Walter Dörwald4894c302003-10-24 14:25:28 +00003974int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003975 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003976 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977{
Walter Dörwald4894c302003-10-24 14:25:28 +00003978 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 return -1;
3980 if (*res==NULL) {
3981 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003982 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983 }
3984 else if (*res==Py_None)
3985 ;
3986 else if (PyInt_Check(*res)) {
3987 /* no overflow check, because we know that the space is enough */
3988 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3989 }
3990 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003991 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 if (repsize==1) {
3993 /* no overflow check, because we know that the space is enough */
3994 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3995 }
3996 else if (repsize!=0) {
3997 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003998 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003999 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004000 repsize - 1;
4001 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 return -1;
4003 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4004 *outp += repsize;
4005 }
4006 }
4007 else
4008 return -1;
4009 return 0;
4010}
4011
4012PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004013 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 PyObject *mapping,
4015 const char *errors)
4016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 /* output object */
4018 PyObject *res = NULL;
4019 /* pointers to the beginning and end+1 of input */
4020 const Py_UNICODE *startp = p;
4021 const Py_UNICODE *endp = p + size;
4022 /* pointer into the output */
4023 Py_UNICODE *str;
4024 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004025 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 char *reason = "character maps to <undefined>";
4027 PyObject *errorHandler = NULL;
4028 PyObject *exc = NULL;
4029 /* the following variable is used for caching string comparisons
4030 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4031 * 3=ignore, 4=xmlcharrefreplace */
4032 int known_errorHandler = -1;
4033
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 if (mapping == NULL) {
4035 PyErr_BadArgument();
4036 return NULL;
4037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038
4039 /* allocate enough for a simple 1:1 translation without
4040 replacements, if we need more, we'll resize */
4041 res = PyUnicode_FromUnicode(NULL, size);
4042 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004043 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 return res;
4046 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 while (p<endp) {
4049 /* try to encode it */
4050 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004051 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 goto onError;
4054 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004055 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 if (x!=Py_None) /* it worked => adjust input pointer */
4057 ++p;
4058 else { /* untranslatable character */
4059 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004060 Py_ssize_t repsize;
4061 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 Py_UNICODE *uni2;
4063 /* startpos for collecting untranslatable chars */
4064 const Py_UNICODE *collstart = p;
4065 const Py_UNICODE *collend = p+1;
4066 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 /* find all untranslatable characters */
4069 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004070 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 goto onError;
4072 Py_XDECREF(x);
4073 if (x!=Py_None)
4074 break;
4075 ++collend;
4076 }
4077 /* cache callback name lookup
4078 * (if not done yet, i.e. it's the first error) */
4079 if (known_errorHandler==-1) {
4080 if ((errors==NULL) || (!strcmp(errors, "strict")))
4081 known_errorHandler = 1;
4082 else if (!strcmp(errors, "replace"))
4083 known_errorHandler = 2;
4084 else if (!strcmp(errors, "ignore"))
4085 known_errorHandler = 3;
4086 else if (!strcmp(errors, "xmlcharrefreplace"))
4087 known_errorHandler = 4;
4088 else
4089 known_errorHandler = 0;
4090 }
4091 switch (known_errorHandler) {
4092 case 1: /* strict */
4093 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4094 goto onError;
4095 case 2: /* replace */
4096 /* No need to check for space, this is a 1:1 replacement */
4097 for (coll = collstart; coll<collend; ++coll)
4098 *str++ = '?';
4099 /* fall through */
4100 case 3: /* ignore */
4101 p = collend;
4102 break;
4103 case 4: /* xmlcharrefreplace */
4104 /* generate replacement (temporarily (mis)uses p) */
4105 for (p = collstart; p < collend; ++p) {
4106 char buffer[2+29+1+1];
4107 char *cp;
4108 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004109 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4111 goto onError;
4112 for (cp = buffer; *cp; ++cp)
4113 *str++ = *cp;
4114 }
4115 p = collend;
4116 break;
4117 default:
4118 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4119 reason, startp, size, &exc,
4120 collstart-startp, collend-startp, &newpos);
4121 if (repunicode == NULL)
4122 goto onError;
4123 /* generate replacement */
4124 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004125 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4127 Py_DECREF(repunicode);
4128 goto onError;
4129 }
4130 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4131 *str++ = *uni2;
4132 p = startp + newpos;
4133 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 }
4135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 /* Resize if we allocated to much */
4138 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004139 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004140 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004141 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 }
4143 Py_XDECREF(exc);
4144 Py_XDECREF(errorHandler);
4145 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 onError:
4148 Py_XDECREF(res);
4149 Py_XDECREF(exc);
4150 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 return NULL;
4152}
4153
4154PyObject *PyUnicode_Translate(PyObject *str,
4155 PyObject *mapping,
4156 const char *errors)
4157{
4158 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004159
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 str = PyUnicode_FromObject(str);
4161 if (str == NULL)
4162 goto onError;
4163 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4164 PyUnicode_GET_SIZE(str),
4165 mapping,
4166 errors);
4167 Py_DECREF(str);
4168 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 onError:
4171 Py_XDECREF(str);
4172 return NULL;
4173}
Tim Petersced69f82003-09-16 20:30:58 +00004174
Guido van Rossum9e896b32000-04-05 20:11:21 +00004175/* --- Decimal Encoder ---------------------------------------------------- */
4176
4177int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004179 char *output,
4180 const char *errors)
4181{
4182 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 PyObject *errorHandler = NULL;
4184 PyObject *exc = NULL;
4185 const char *encoding = "decimal";
4186 const char *reason = "invalid decimal Unicode string";
4187 /* the following variable is used for caching string comparisons
4188 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4189 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004190
4191 if (output == NULL) {
4192 PyErr_BadArgument();
4193 return -1;
4194 }
4195
4196 p = s;
4197 end = s + length;
4198 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004200 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004202 Py_ssize_t repsize;
4203 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_UNICODE *uni2;
4205 Py_UNICODE *collstart;
4206 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Guido van Rossum9e896b32000-04-05 20:11:21 +00004208 if (Py_UNICODE_ISSPACE(ch)) {
4209 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004211 continue;
4212 }
4213 decimal = Py_UNICODE_TODECIMAL(ch);
4214 if (decimal >= 0) {
4215 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004217 continue;
4218 }
Guido van Rossumba477042000-04-06 18:18:10 +00004219 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004220 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004222 continue;
4223 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 /* All other characters are considered unencodable */
4225 collstart = p;
4226 collend = p+1;
4227 while (collend < end) {
4228 if ((0 < *collend && *collend < 256) ||
4229 !Py_UNICODE_ISSPACE(*collend) ||
4230 Py_UNICODE_TODECIMAL(*collend))
4231 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 /* cache callback name lookup
4234 * (if not done yet, i.e. it's the first error) */
4235 if (known_errorHandler==-1) {
4236 if ((errors==NULL) || (!strcmp(errors, "strict")))
4237 known_errorHandler = 1;
4238 else if (!strcmp(errors, "replace"))
4239 known_errorHandler = 2;
4240 else if (!strcmp(errors, "ignore"))
4241 known_errorHandler = 3;
4242 else if (!strcmp(errors, "xmlcharrefreplace"))
4243 known_errorHandler = 4;
4244 else
4245 known_errorHandler = 0;
4246 }
4247 switch (known_errorHandler) {
4248 case 1: /* strict */
4249 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4250 goto onError;
4251 case 2: /* replace */
4252 for (p = collstart; p < collend; ++p)
4253 *output++ = '?';
4254 /* fall through */
4255 case 3: /* ignore */
4256 p = collend;
4257 break;
4258 case 4: /* xmlcharrefreplace */
4259 /* generate replacement (temporarily (mis)uses p) */
4260 for (p = collstart; p < collend; ++p)
4261 output += sprintf(output, "&#%d;", (int)*p);
4262 p = collend;
4263 break;
4264 default:
4265 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4266 encoding, reason, s, length, &exc,
4267 collstart-s, collend-s, &newpos);
4268 if (repunicode == NULL)
4269 goto onError;
4270 /* generate replacement */
4271 repsize = PyUnicode_GET_SIZE(repunicode);
4272 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4273 Py_UNICODE ch = *uni2;
4274 if (Py_UNICODE_ISSPACE(ch))
4275 *output++ = ' ';
4276 else {
4277 decimal = Py_UNICODE_TODECIMAL(ch);
4278 if (decimal >= 0)
4279 *output++ = '0' + decimal;
4280 else if (0 < ch && ch < 256)
4281 *output++ = (char)ch;
4282 else {
4283 Py_DECREF(repunicode);
4284 raise_encode_exception(&exc, encoding,
4285 s, length, collstart-s, collend-s, reason);
4286 goto onError;
4287 }
4288 }
4289 }
4290 p = s + newpos;
4291 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004292 }
4293 }
4294 /* 0-terminate the output string */
4295 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_XDECREF(exc);
4297 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004298 return 0;
4299
4300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(exc);
4302 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004303 return -1;
4304}
4305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306/* --- Helpers ------------------------------------------------------------ */
4307
Thomas Wouters477c8d52006-05-27 19:21:47 +00004308#define STRINGLIB_CHAR Py_UNICODE
4309
4310#define STRINGLIB_LEN PyUnicode_GET_SIZE
4311#define STRINGLIB_NEW PyUnicode_FromUnicode
4312#define STRINGLIB_STR PyUnicode_AS_UNICODE
4313
4314Py_LOCAL_INLINE(int)
4315STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004317 if (str[0] != other[0])
4318 return 1;
4319 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320}
4321
Thomas Wouters477c8d52006-05-27 19:21:47 +00004322#define STRINGLIB_EMPTY unicode_empty
4323
4324#include "stringlib/fastsearch.h"
4325
4326#include "stringlib/count.h"
4327#include "stringlib/find.h"
4328#include "stringlib/partition.h"
4329
/* Helper macro to fix up start/end slice values.

   Expects variables named `start` and `end` in the calling scope and
   adjusts them against (obj)->length: a negative value is taken relative
   to the length and then floored at 0, and `end` is capped at the length.

   Wrapped in do { } while (0) so the expansion is a single statement and
   behaves correctly inside an unbraced if/else; use it followed by a
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4342
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004344 PyObject *substr,
4345 Py_ssize_t start,
4346 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004348 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004349 PyUnicodeObject* str_obj;
4350 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004351
Thomas Wouters477c8d52006-05-27 19:21:47 +00004352 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4353 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004355 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4356 if (!sub_obj) {
4357 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 return -1;
4359 }
Tim Petersced69f82003-09-16 20:30:58 +00004360
Thomas Wouters477c8d52006-05-27 19:21:47 +00004361 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004362
Thomas Wouters477c8d52006-05-27 19:21:47 +00004363 result = stringlib_count(
4364 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4365 );
4366
4367 Py_DECREF(sub_obj);
4368 Py_DECREF(str_obj);
4369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 return result;
4371}
4372
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 PyObject *sub,
4375 Py_ssize_t start,
4376 Py_ssize_t end,
4377 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004382 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004383 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004384 sub = PyUnicode_FromObject(sub);
4385 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004386 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004387 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 }
Tim Petersced69f82003-09-16 20:30:58 +00004389
Thomas Wouters477c8d52006-05-27 19:21:47 +00004390 if (direction > 0)
4391 result = stringlib_find_slice(
4392 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4393 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4394 start, end
4395 );
4396 else
4397 result = stringlib_rfind_slice(
4398 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4399 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4400 start, end
4401 );
4402
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004404 Py_DECREF(sub);
4405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 return result;
4407}
4408
Tim Petersced69f82003-09-16 20:30:58 +00004409static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410int tailmatch(PyUnicodeObject *self,
4411 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t start,
4413 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 int direction)
4415{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 if (substring->length == 0)
4417 return 1;
4418
Thomas Wouters477c8d52006-05-27 19:21:47 +00004419 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420
4421 end -= substring->length;
4422 if (end < start)
4423 return 0;
4424
4425 if (direction > 0) {
4426 if (Py_UNICODE_MATCH(self, end, substring))
4427 return 1;
4428 } else {
4429 if (Py_UNICODE_MATCH(self, start, substring))
4430 return 1;
4431 }
4432
4433 return 0;
4434}
4435
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t start,
4439 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 int direction)
4441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 str = PyUnicode_FromObject(str);
4445 if (str == NULL)
4446 return -1;
4447 substr = PyUnicode_FromObject(substr);
4448 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004449 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 return -1;
4451 }
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 result = tailmatch((PyUnicodeObject *)str,
4454 (PyUnicodeObject *)substr,
4455 start, end, direction);
4456 Py_DECREF(str);
4457 Py_DECREF(substr);
4458 return result;
4459}
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461/* Apply fixfct filter to the Unicode object self and return a
4462 reference to the modified object */
4463
Tim Petersced69f82003-09-16 20:30:58 +00004464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465PyObject *fixup(PyUnicodeObject *self,
4466 int (*fixfct)(PyUnicodeObject *s))
4467{
4468
4469 PyUnicodeObject *u;
4470
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004471 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (u == NULL)
4473 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004474
4475 Py_UNICODE_COPY(u->str, self->str, self->length);
4476
Tim Peters7a29bd52001-09-12 03:03:31 +00004477 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 /* fixfct should return TRUE if it modified the buffer. If
4479 FALSE, return a reference to the original buffer instead
4480 (to save space, not time) */
4481 Py_INCREF(self);
4482 Py_DECREF(u);
4483 return (PyObject*) self;
4484 }
4485 return (PyObject*) u;
4486}
4487
Tim Petersced69f82003-09-16 20:30:58 +00004488static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489int fixupper(PyUnicodeObject *self)
4490{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004491 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 Py_UNICODE *s = self->str;
4493 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004494
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 while (len-- > 0) {
4496 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 ch = Py_UNICODE_TOUPPER(*s);
4499 if (ch != *s) {
4500 status = 1;
4501 *s = ch;
4502 }
4503 s++;
4504 }
4505
4506 return status;
4507}
4508
Tim Petersced69f82003-09-16 20:30:58 +00004509static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510int fixlower(PyUnicodeObject *self)
4511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004512 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 Py_UNICODE *s = self->str;
4514 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004515
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 while (len-- > 0) {
4517 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004518
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 ch = Py_UNICODE_TOLOWER(*s);
4520 if (ch != *s) {
4521 status = 1;
4522 *s = ch;
4523 }
4524 s++;
4525 }
4526
4527 return status;
4528}
4529
Tim Petersced69f82003-09-16 20:30:58 +00004530static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531int fixswapcase(PyUnicodeObject *self)
4532{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 Py_UNICODE *s = self->str;
4535 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 while (len-- > 0) {
4538 if (Py_UNICODE_ISUPPER(*s)) {
4539 *s = Py_UNICODE_TOLOWER(*s);
4540 status = 1;
4541 } else if (Py_UNICODE_ISLOWER(*s)) {
4542 *s = Py_UNICODE_TOUPPER(*s);
4543 status = 1;
4544 }
4545 s++;
4546 }
4547
4548 return status;
4549}
4550
Tim Petersced69f82003-09-16 20:30:58 +00004551static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552int fixcapitalize(PyUnicodeObject *self)
4553{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004555 Py_UNICODE *s = self->str;
4556 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004557
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004558 if (len == 0)
4559 return 0;
4560 if (Py_UNICODE_ISLOWER(*s)) {
4561 *s = Py_UNICODE_TOUPPER(*s);
4562 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004564 s++;
4565 while (--len > 0) {
4566 if (Py_UNICODE_ISUPPER(*s)) {
4567 *s = Py_UNICODE_TOLOWER(*s);
4568 status = 1;
4569 }
4570 s++;
4571 }
4572 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573}
4574
4575static
4576int fixtitle(PyUnicodeObject *self)
4577{
4578 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4579 register Py_UNICODE *e;
4580 int previous_is_cased;
4581
4582 /* Shortcut for single character strings */
4583 if (PyUnicode_GET_SIZE(self) == 1) {
4584 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4585 if (*p != ch) {
4586 *p = ch;
4587 return 1;
4588 }
4589 else
4590 return 0;
4591 }
Tim Petersced69f82003-09-16 20:30:58 +00004592
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 e = p + PyUnicode_GET_SIZE(self);
4594 previous_is_cased = 0;
4595 for (; p < e; p++) {
4596 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004597
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 if (previous_is_cased)
4599 *p = Py_UNICODE_TOLOWER(ch);
4600 else
4601 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004602
4603 if (Py_UNICODE_ISLOWER(ch) ||
4604 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 Py_UNICODE_ISTITLE(ch))
4606 previous_is_cased = 1;
4607 else
4608 previous_is_cased = 0;
4609 }
4610 return 1;
4611}
4612
Tim Peters8ce9f162004-08-27 01:49:32 +00004613PyObject *
4614PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Tim Peters8ce9f162004-08-27 01:49:32 +00004616 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004617 const Py_UNICODE blank = ' ';
4618 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004619 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004620 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004621 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4622 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004623 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4624 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004626 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004627 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
Tim Peters05eba1f2004-08-27 21:32:02 +00004629 fseq = PySequence_Fast(seq, "");
4630 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004632 }
4633
Tim Peters91879ab2004-08-27 22:35:44 +00004634 /* Grrrr. A codec may be invoked to convert str objects to
4635 * Unicode, and so it's possible to call back into Python code
4636 * during PyUnicode_FromObject(), and so it's possible for a sick
4637 * codec to change the size of fseq (if seq is a list). Therefore
4638 * we have to keep refetching the size -- can't assume seqlen
4639 * is invariant.
4640 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004641 seqlen = PySequence_Fast_GET_SIZE(fseq);
4642 /* If empty sequence, return u"". */
4643 if (seqlen == 0) {
4644 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4645 goto Done;
4646 }
4647 /* If singleton sequence with an exact Unicode, return that. */
4648 if (seqlen == 1) {
4649 item = PySequence_Fast_GET_ITEM(fseq, 0);
4650 if (PyUnicode_CheckExact(item)) {
4651 Py_INCREF(item);
4652 res = (PyUnicodeObject *)item;
4653 goto Done;
4654 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004655 }
4656
Tim Peters05eba1f2004-08-27 21:32:02 +00004657 /* At least two items to join, or one that isn't exact Unicode. */
4658 if (seqlen > 1) {
4659 /* Set up sep and seplen -- they're needed. */
4660 if (separator == NULL) {
4661 sep = &blank;
4662 seplen = 1;
4663 }
4664 else {
4665 internal_separator = PyUnicode_FromObject(separator);
4666 if (internal_separator == NULL)
4667 goto onError;
4668 sep = PyUnicode_AS_UNICODE(internal_separator);
4669 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004670 /* In case PyUnicode_FromObject() mutated seq. */
4671 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004672 }
4673 }
4674
4675 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004676 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004677 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004678 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 res_p = PyUnicode_AS_UNICODE(res);
4680 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004681
Tim Peters05eba1f2004-08-27 21:32:02 +00004682 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004683 Py_ssize_t itemlen;
4684 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004685
4686 item = PySequence_Fast_GET_ITEM(fseq, i);
4687 /* Convert item to Unicode. */
4688 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4689 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004690 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004691 " %.80s found",
4692 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004693 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004694 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004695 item = PyUnicode_FromObject(item);
4696 if (item == NULL)
4697 goto onError;
4698 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004699
Tim Peters91879ab2004-08-27 22:35:44 +00004700 /* In case PyUnicode_FromObject() mutated seq. */
4701 seqlen = PySequence_Fast_GET_SIZE(fseq);
4702
Tim Peters8ce9f162004-08-27 01:49:32 +00004703 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004705 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004707 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004708 if (i < seqlen - 1) {
4709 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004710 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004711 goto Overflow;
4712 }
4713 if (new_res_used > res_alloc) {
4714 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004715 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004716 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004717 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004718 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004719 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004720 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004721 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004723 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004724 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004726
4727 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004728 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004729 res_p += itemlen;
4730 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004731 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004732 res_p += seplen;
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004735 res_used = new_res_used;
4736 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004737
Tim Peters05eba1f2004-08-27 21:32:02 +00004738 /* Shrink res to match the used area; this probably can't fail,
4739 * but it's cheap to check.
4740 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004741 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004742 goto onError;
4743
4744 Done:
4745 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004746 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 return (PyObject *)res;
4748
Tim Peters8ce9f162004-08-27 01:49:32 +00004749 Overflow:
4750 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004751 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004752 Py_DECREF(item);
4753 /* fall through */
4754
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004756 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004757 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 return NULL;
4760}
4761
Tim Petersced69f82003-09-16 20:30:58 +00004762static
4763PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t left,
4765 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 Py_UNICODE fill)
4767{
4768 PyUnicodeObject *u;
4769
4770 if (left < 0)
4771 left = 0;
4772 if (right < 0)
4773 right = 0;
4774
Tim Peters7a29bd52001-09-12 03:03:31 +00004775 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 Py_INCREF(self);
4777 return self;
4778 }
4779
4780 u = _PyUnicode_New(left + self->length + right);
4781 if (u) {
4782 if (left)
4783 Py_UNICODE_FILL(u->str, fill, left);
4784 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4785 if (right)
4786 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4787 }
4788
4789 return u;
4790}
4791
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * The expansion uses three names from the calling function: a
 * `PyObject *str` scratch variable, the `list` being built, and an
 * `onError` label that is jumped to when either the slice creation or
 * the append fails.  The do/while(0) wrapper makes the macro behave as
 * a single statement, so it is safe inside unbraced if/else bodies.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4802
4803static
4804PyObject *split_whitespace(PyUnicodeObject *self,
4805 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 register Py_ssize_t i;
4809 register Py_ssize_t j;
4810 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 PyObject *str;
4812
4813 for (i = j = 0; i < len; ) {
4814 /* find a token */
4815 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4816 i++;
4817 j = i;
4818 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4819 i++;
4820 if (j < i) {
4821 if (maxcount-- <= 0)
4822 break;
4823 SPLIT_APPEND(self->str, j, i);
4824 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4825 i++;
4826 j = i;
4827 }
4828 }
4829 if (j < len) {
4830 SPLIT_APPEND(self->str, j, len);
4831 }
4832 return list;
4833
4834 onError:
4835 Py_DECREF(list);
4836 return NULL;
4837}
4838
4839PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004840 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004842 register Py_ssize_t i;
4843 register Py_ssize_t j;
4844 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 PyObject *list;
4846 PyObject *str;
4847 Py_UNICODE *data;
4848
4849 string = PyUnicode_FromObject(string);
4850 if (string == NULL)
4851 return NULL;
4852 data = PyUnicode_AS_UNICODE(string);
4853 len = PyUnicode_GET_SIZE(string);
4854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 list = PyList_New(0);
4856 if (!list)
4857 goto onError;
4858
4859 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004863 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004867 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 if (i < len) {
4869 if (data[i] == '\r' && i + 1 < len &&
4870 data[i+1] == '\n')
4871 i += 2;
4872 else
4873 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004874 if (keepends)
4875 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 }
Guido van Rossum86662912000-04-11 15:38:46 +00004877 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 j = i;
4879 }
4880 if (j < len) {
4881 SPLIT_APPEND(data, j, len);
4882 }
4883
4884 Py_DECREF(string);
4885 return list;
4886
4887 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004888 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 Py_DECREF(string);
4890 return NULL;
4891}
4892
Tim Petersced69f82003-09-16 20:30:58 +00004893static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894PyObject *split_char(PyUnicodeObject *self,
4895 PyObject *list,
4896 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 register Py_ssize_t i;
4900 register Py_ssize_t j;
4901 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 PyObject *str;
4903
4904 for (i = j = 0; i < len; ) {
4905 if (self->str[i] == ch) {
4906 if (maxcount-- <= 0)
4907 break;
4908 SPLIT_APPEND(self->str, j, i);
4909 i = j = i + 1;
4910 } else
4911 i++;
4912 }
4913 if (j <= len) {
4914 SPLIT_APPEND(self->str, j, len);
4915 }
4916 return list;
4917
4918 onError:
4919 Py_DECREF(list);
4920 return NULL;
4921}
4922
Tim Petersced69f82003-09-16 20:30:58 +00004923static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924PyObject *split_substring(PyUnicodeObject *self,
4925 PyObject *list,
4926 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004929 register Py_ssize_t i;
4930 register Py_ssize_t j;
4931 Py_ssize_t len = self->length;
4932 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 PyObject *str;
4934
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004935 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 if (Py_UNICODE_MATCH(self, i, substring)) {
4937 if (maxcount-- <= 0)
4938 break;
4939 SPLIT_APPEND(self->str, j, i);
4940 i = j = i + sublen;
4941 } else
4942 i++;
4943 }
4944 if (j <= len) {
4945 SPLIT_APPEND(self->str, j, len);
4946 }
4947 return list;
4948
4949 onError:
4950 Py_DECREF(list);
4951 return NULL;
4952}
4953
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004954static
4955PyObject *rsplit_whitespace(PyUnicodeObject *self,
4956 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004957 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 register Py_ssize_t i;
4960 register Py_ssize_t j;
4961 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004962 PyObject *str;
4963
4964 for (i = j = len - 1; i >= 0; ) {
4965 /* find a token */
4966 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4967 i--;
4968 j = i;
4969 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4970 i--;
4971 if (j > i) {
4972 if (maxcount-- <= 0)
4973 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004974 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4976 i--;
4977 j = i;
4978 }
4979 }
4980 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004981 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004983 if (PyList_Reverse(list) < 0)
4984 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004985 return list;
4986
4987 onError:
4988 Py_DECREF(list);
4989 return NULL;
4990}
4991
4992static
4993PyObject *rsplit_char(PyUnicodeObject *self,
4994 PyObject *list,
4995 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004996 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 register Py_ssize_t i;
4999 register Py_ssize_t j;
5000 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005001 PyObject *str;
5002
5003 for (i = j = len - 1; i >= 0; ) {
5004 if (self->str[i] == ch) {
5005 if (maxcount-- <= 0)
5006 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005007 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005008 j = i = i - 1;
5009 } else
5010 i--;
5011 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005012 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005013 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005014 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015 if (PyList_Reverse(list) < 0)
5016 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005017 return list;
5018
5019 onError:
5020 Py_DECREF(list);
5021 return NULL;
5022}
5023
5024static
5025PyObject *rsplit_substring(PyUnicodeObject *self,
5026 PyObject *list,
5027 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005028 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 register Py_ssize_t i;
5031 register Py_ssize_t j;
5032 Py_ssize_t len = self->length;
5033 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005034 PyObject *str;
5035
5036 for (i = len - sublen, j = len; i >= 0; ) {
5037 if (Py_UNICODE_MATCH(self, i, substring)) {
5038 if (maxcount-- <= 0)
5039 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005040 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041 j = i;
5042 i -= sublen;
5043 } else
5044 i--;
5045 }
5046 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005049 if (PyList_Reverse(list) < 0)
5050 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005051 return list;
5052
5053 onError:
5054 Py_DECREF(list);
5055 return NULL;
5056}
5057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058#undef SPLIT_APPEND
5059
5060static
5061PyObject *split(PyUnicodeObject *self,
5062 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005063 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
5065 PyObject *list;
5066
5067 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005068 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
5070 list = PyList_New(0);
5071 if (!list)
5072 return NULL;
5073
5074 if (substring == NULL)
5075 return split_whitespace(self,list,maxcount);
5076
5077 else if (substring->length == 1)
5078 return split_char(self,list,substring->str[0],maxcount);
5079
5080 else if (substring->length == 0) {
5081 Py_DECREF(list);
5082 PyErr_SetString(PyExc_ValueError, "empty separator");
5083 return NULL;
5084 }
5085 else
5086 return split_substring(self,list,substring,maxcount);
5087}
5088
Tim Petersced69f82003-09-16 20:30:58 +00005089static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005090PyObject *rsplit(PyUnicodeObject *self,
5091 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005093{
5094 PyObject *list;
5095
5096 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005097 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005098
5099 list = PyList_New(0);
5100 if (!list)
5101 return NULL;
5102
5103 if (substring == NULL)
5104 return rsplit_whitespace(self,list,maxcount);
5105
5106 else if (substring->length == 1)
5107 return rsplit_char(self,list,substring->str[0],maxcount);
5108
5109 else if (substring->length == 0) {
5110 Py_DECREF(list);
5111 PyErr_SetString(PyExc_ValueError, "empty separator");
5112 return NULL;
5113 }
5114 else
5115 return rsplit_substring(self,list,substring,maxcount);
5116}
5117
5118static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119PyObject *replace(PyUnicodeObject *self,
5120 PyUnicodeObject *str1,
5121 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005122 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
5124 PyUnicodeObject *u;
5125
5126 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005127 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128
Thomas Wouters477c8d52006-05-27 19:21:47 +00005129 if (str1->length == str2->length) {
5130 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005131 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005132 if (str1->length == 1) {
5133 /* replace characters */
5134 Py_UNICODE u1, u2;
5135 if (!findchar(self->str, self->length, str1->str[0]))
5136 goto nothing;
5137 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5138 if (!u)
5139 return NULL;
5140 Py_UNICODE_COPY(u->str, self->str, self->length);
5141 u1 = str1->str[0];
5142 u2 = str2->str[0];
5143 for (i = 0; i < u->length; i++)
5144 if (u->str[i] == u1) {
5145 if (--maxcount < 0)
5146 break;
5147 u->str[i] = u2;
5148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 i = fastsearch(
5151 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 if (i < 0)
5154 goto nothing;
5155 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5156 if (!u)
5157 return NULL;
5158 Py_UNICODE_COPY(u->str, self->str, self->length);
5159 while (i <= self->length - str1->length)
5160 if (Py_UNICODE_MATCH(self, i, str1)) {
5161 if (--maxcount < 0)
5162 break;
5163 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5164 i += str1->length;
5165 } else
5166 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005169
5170 Py_ssize_t n, i, j, e;
5171 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 Py_UNICODE *p;
5173
5174 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 if (n > maxcount)
5177 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005178 if (n == 0)
5179 goto nothing;
5180 /* new_size = self->length + n * (str2->length - str1->length)); */
5181 delta = (str2->length - str1->length);
5182 if (delta == 0) {
5183 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005185 product = n * (str2->length - str1->length);
5186 if ((product / (str2->length - str1->length)) != n) {
5187 PyErr_SetString(PyExc_OverflowError,
5188 "replace string is too long");
5189 return NULL;
5190 }
5191 new_size = self->length + product;
5192 if (new_size < 0) {
5193 PyErr_SetString(PyExc_OverflowError,
5194 "replace string is too long");
5195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
5197 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005198 u = _PyUnicode_New(new_size);
5199 if (!u)
5200 return NULL;
5201 i = 0;
5202 p = u->str;
5203 e = self->length - str1->length;
5204 if (str1->length > 0) {
5205 while (n-- > 0) {
5206 /* look for next match */
5207 j = i;
5208 while (j <= e) {
5209 if (Py_UNICODE_MATCH(self, j, str1))
5210 break;
5211 j++;
5212 }
5213 if (j > i) {
5214 if (j > e)
5215 break;
5216 /* copy unchanged part [i:j] */
5217 Py_UNICODE_COPY(p, self->str+i, j-i);
5218 p += j - i;
5219 }
5220 /* copy substitution string */
5221 if (str2->length > 0) {
5222 Py_UNICODE_COPY(p, str2->str, str2->length);
5223 p += str2->length;
5224 }
5225 i = j + str1->length;
5226 }
5227 if (i < self->length)
5228 /* copy tail [i:] */
5229 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5230 } else {
5231 /* interleave */
5232 while (n > 0) {
5233 Py_UNICODE_COPY(p, str2->str, str2->length);
5234 p += str2->length;
5235 if (--n <= 0)
5236 break;
5237 *p++ = self->str[i++];
5238 }
5239 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005243
5244nothing:
5245 /* nothing to replace; return original string (when possible) */
5246 if (PyUnicode_CheckExact(self)) {
5247 Py_INCREF(self);
5248 return (PyObject *) self;
5249 }
5250 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251}
5252
5253/* --- Unicode Object Methods --------------------------------------------- */
5254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005255PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256"S.title() -> unicode\n\
5257\n\
5258Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005259characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005262unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 return fixup(self, fixtitle);
5265}
5266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005267PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268"S.capitalize() -> unicode\n\
5269\n\
5270Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005271have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005274unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 return fixup(self, fixcapitalize);
5277}
5278
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits `self` on whitespace, capitalizes every word through fixup(),
 * and joins the words again with PyUnicode_Join() using a NULL
 * separator.  Returns NULL if splitting or capitalizing fails.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                               fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5316
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005317/* Argument converter. Coerces to a single unicode character */
5318
5319static int
5320convert_uc(PyObject *obj, void *addr)
5321{
5322 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5323 PyObject *uniobj;
5324 Py_UNICODE *unistr;
5325
5326 uniobj = PyUnicode_FromObject(obj);
5327 if (uniobj == NULL) {
5328 PyErr_SetString(PyExc_TypeError,
5329 "The fill character cannot be converted to Unicode");
5330 return 0;
5331 }
5332 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5333 PyErr_SetString(PyExc_TypeError,
5334 "The fill character must be exactly one character long");
5335 Py_DECREF(uniobj);
5336 return 0;
5337 }
5338 unistr = PyUnicode_AS_UNICODE(uniobj);
5339 *fillcharloc = unistr[0];
5340 Py_DECREF(uniobj);
5341 return 1;
5342}
5343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005344PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005345"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005347Return S centered in a Unicode string of length width. Padding is\n\
5348done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
5350static PyObject *
5351unicode_center(PyUnicodeObject *self, PyObject *args)
5352{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t marg, left;
5354 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005355 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Thomas Woutersde017742006-02-16 19:34:37 +00005357 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 return NULL;
5359
Tim Peters7a29bd52001-09-12 03:03:31 +00005360 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 Py_INCREF(self);
5362 return (PyObject*) self;
5363 }
5364
5365 marg = width - self->length;
5366 left = marg / 2 + (marg & width & 1);
5367
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005368 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369}
5370
Marc-André Lemburge5034372000-08-08 08:04:29 +00005371#if 0
5372
5373/* This code should go into some future Unicode collation support
5374 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005375 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005377/* speedy UTF-16 code point order comparison */
5378/* gleaned from: */
5379/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5380
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005381static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005382{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005383 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005384 0, 0, 0, 0, 0, 0, 0, 0,
5385 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005386 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005387};
5388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389static int
5390unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005392 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 Py_UNICODE *s1 = str1->str;
5395 Py_UNICODE *s2 = str2->str;
5396
5397 len1 = str1->length;
5398 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005401 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005402
5403 c1 = *s1++;
5404 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005405
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005406 if (c1 > (1<<11) * 26)
5407 c1 += utf16Fixup[c1>>11];
5408 if (c2 > (1<<11) * 26)
5409 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005410 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005411
5412 if (c1 != c2)
5413 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005414
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005415 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 }
5417
5418 return (len1 < len2) ? -1 : (len1 != len2);
5419}
5420
Marc-André Lemburge5034372000-08-08 08:04:29 +00005421#else
5422
5423static int
5424unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005426 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005427
5428 Py_UNICODE *s1 = str1->str;
5429 Py_UNICODE *s2 = str2->str;
5430
5431 len1 = str1->length;
5432 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005433
Marc-André Lemburge5034372000-08-08 08:04:29 +00005434 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005435 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005436
Fredrik Lundh45714e92001-06-26 16:39:36 +00005437 c1 = *s1++;
5438 c2 = *s2++;
5439
5440 if (c1 != c2)
5441 return (c1 < c2) ? -1 : 1;
5442
Marc-André Lemburge5034372000-08-08 08:04:29 +00005443 len1--; len2--;
5444 }
5445
5446 return (len1 < len2) ? -1 : (len1 != len2);
5447}
5448
5449#endif
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451int PyUnicode_Compare(PyObject *left,
5452 PyObject *right)
5453{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005454 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5455 return unicode_compare((PyUnicodeObject *)left,
5456 (PyUnicodeObject *)right);
5457 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5458 (PyUnicode_Check(left) && PyString_Check(right))) {
5459 if (PyUnicode_Check(left))
5460 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5461 if (PyUnicode_Check(right))
5462 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5463 assert(PyString_Check(left));
5464 assert(PyString_Check(right));
5465 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005467 PyErr_Format(PyExc_TypeError,
5468 "Can't compare %.100s and %.100s",
5469 left->ob_type->tp_name,
5470 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 return -1;
5472}
5473
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005474PyObject *PyUnicode_RichCompare(PyObject *left,
5475 PyObject *right,
5476 int op)
5477{
5478 int result;
5479
5480 result = PyUnicode_Compare(left, right);
5481 if (result == -1 && PyErr_Occurred())
5482 goto onError;
5483
5484 /* Convert the return value to a Boolean */
5485 switch (op) {
5486 case Py_EQ:
5487 result = (result == 0);
5488 break;
5489 case Py_NE:
5490 result = (result != 0);
5491 break;
5492 case Py_LE:
5493 result = (result <= 0);
5494 break;
5495 case Py_GE:
5496 result = (result >= 0);
5497 break;
5498 case Py_LT:
5499 result = (result == -1);
5500 break;
5501 case Py_GT:
5502 result = (result == 1);
5503 break;
5504 }
5505 return PyBool_FromLong(result);
5506
5507 onError:
5508
5509 /* Standard case
5510
5511 Type errors mean that PyUnicode_FromObject() could not convert
5512 one of the arguments (usually the right hand side) to Unicode,
5513 ie. we can't handle the comparison request. However, it is
5514 possible that the other object knows a comparison method, which
5515 is why we return Py_NotImplemented to give the other object a
5516 chance.
5517
5518 */
5519 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5520 PyErr_Clear();
5521 Py_INCREF(Py_NotImplemented);
5522 return Py_NotImplemented;
5523 }
5524 if (op != Py_EQ && op != Py_NE)
5525 return NULL;
5526
5527 /* Equality comparison.
5528
5529 This is a special case: we silence any PyExc_UnicodeDecodeError
5530 and instead turn it into a PyErr_UnicodeWarning.
5531
5532 */
5533 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5534 return NULL;
5535 PyErr_Clear();
5536 if (PyErr_Warn(PyExc_UnicodeWarning,
5537 (op == Py_EQ) ?
5538 "Unicode equal comparison "
5539 "failed to convert both arguments to Unicode - "
5540 "interpreting them as being unequal" :
5541 "Unicode unequal comparison "
5542 "failed to convert both arguments to Unicode - "
5543 "interpreting them as being unequal"
5544 ) < 0)
5545 return NULL;
5546 result = (op == Py_NE);
5547 return PyBool_FromLong(result);
5548}
5549
Guido van Rossum403d68b2000-03-13 15:55:09 +00005550int PyUnicode_Contains(PyObject *container,
5551 PyObject *element)
5552{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005553 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005555
5556 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005557 sub = PyUnicode_FromObject(element);
5558 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005559 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005560 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005561 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005562 }
5563
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 str = PyUnicode_FromObject(container);
5565 if (!str) {
5566 Py_DECREF(sub);
5567 return -1;
5568 }
5569
5570 result = stringlib_contains_obj(str, sub);
5571
5572 Py_DECREF(str);
5573 Py_DECREF(sub);
5574
Guido van Rossum403d68b2000-03-13 15:55:09 +00005575 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005576}
5577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578/* Concat to string or Unicode object giving a new Unicode object. */
5579
5580PyObject *PyUnicode_Concat(PyObject *left,
5581 PyObject *right)
5582{
5583 PyUnicodeObject *u = NULL, *v = NULL, *w;
5584
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005585 if (PyBytes_Check(left) || PyBytes_Check(right))
5586 return PyBytes_Concat(left, right);
5587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 /* Coerce the two arguments */
5589 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5590 if (u == NULL)
5591 goto onError;
5592 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5593 if (v == NULL)
5594 goto onError;
5595
5596 /* Shortcuts */
5597 if (v == unicode_empty) {
5598 Py_DECREF(v);
5599 return (PyObject *)u;
5600 }
5601 if (u == unicode_empty) {
5602 Py_DECREF(u);
5603 return (PyObject *)v;
5604 }
5605
5606 /* Concat the two Unicode strings */
5607 w = _PyUnicode_New(u->length + v->length);
5608 if (w == NULL)
5609 goto onError;
5610 Py_UNICODE_COPY(w->str, u->str, u->length);
5611 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5612
5613 Py_DECREF(u);
5614 Py_DECREF(v);
5615 return (PyObject *)w;
5616
5617onError:
5618 Py_XDECREF(u);
5619 Py_XDECREF(v);
5620 return NULL;
5621}
5622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005623PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624"S.count(sub[, start[, end]]) -> int\n\
5625\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005626Return the number of non-overlapping occurrences of substring sub in\n\
5627Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005628interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
5630static PyObject *
5631unicode_count(PyUnicodeObject *self, PyObject *args)
5632{
5633 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005635 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 PyObject *result;
5637
Guido van Rossumb8872e62000-05-09 14:14:27 +00005638 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5639 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 return NULL;
5641
5642 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005643 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (substring == NULL)
5645 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005646
Thomas Wouters477c8d52006-05-27 19:21:47 +00005647 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
Thomas Wouters477c8d52006-05-27 19:21:47 +00005649 result = PyInt_FromSsize_t(
5650 stringlib_count(self->str + start, end - start,
5651 substring->str, substring->length)
5652 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653
5654 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 return result;
5657}
5658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005660"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005662Encodes S using the codec registered for encoding. encoding defaults\n\
5663to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005664handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5666'xmlcharrefreplace' as well as any other name registered with\n\
5667codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
5669static PyObject *
5670unicode_encode(PyUnicodeObject *self, PyObject *args)
5671{
5672 char *encoding = NULL;
5673 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005674 PyObject *v;
5675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5677 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005678 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005679 if (v == NULL)
5680 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005681 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005682 if (PyString_Check(v)) {
5683 /* Old codec, turn it into bytes */
5684 PyObject *b = PyBytes_FromObject(v);
5685 Py_DECREF(v);
5686 return b;
5687 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005688 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005689 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005690 "(type=%.400s)",
5691 v->ob_type->tp_name);
5692 Py_DECREF(v);
5693 return NULL;
5694 }
5695 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005696
5697 onError:
5698 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005699}
5700
5701PyDoc_STRVAR(decode__doc__,
5702"S.decode([encoding[,errors]]) -> string or unicode\n\
5703\n\
5704Decodes S using the codec registered for encoding. encoding defaults\n\
5705to the default encoding. errors may be given to set a different error\n\
5706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5707a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5708as well as any other name registerd with codecs.register_error that is\n\
5709able to handle UnicodeDecodeErrors.");
5710
5711static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005712unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005713{
5714 char *encoding = NULL;
5715 char *errors = NULL;
5716 PyObject *v;
5717
5718 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5719 return NULL;
5720 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005721 if (v == NULL)
5722 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005723 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5724 PyErr_Format(PyExc_TypeError,
5725 "decoder did not return a string/unicode object "
5726 "(type=%.400s)",
5727 v->ob_type->tp_name);
5728 Py_DECREF(v);
5729 return NULL;
5730 }
5731 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005732
5733 onError:
5734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735}
5736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005737PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738"S.expandtabs([tabsize]) -> unicode\n\
5739\n\
5740Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005741If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743static PyObject*
5744unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5745{
5746 Py_UNICODE *e;
5747 Py_UNICODE *p;
5748 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 PyUnicodeObject *u;
5751 int tabsize = 8;
5752
5753 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5754 return NULL;
5755
Thomas Wouters7e474022000-07-16 12:04:32 +00005756 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 i = j = 0;
5758 e = self->str + self->length;
5759 for (p = self->str; p < e; p++)
5760 if (*p == '\t') {
5761 if (tabsize > 0)
5762 j += tabsize - (j % tabsize);
5763 }
5764 else {
5765 j++;
5766 if (*p == '\n' || *p == '\r') {
5767 i += j;
5768 j = 0;
5769 }
5770 }
5771
5772 /* Second pass: create output string and fill it */
5773 u = _PyUnicode_New(i + j);
5774 if (!u)
5775 return NULL;
5776
5777 j = 0;
5778 q = u->str;
5779
5780 for (p = self->str; p < e; p++)
5781 if (*p == '\t') {
5782 if (tabsize > 0) {
5783 i = tabsize - (j % tabsize);
5784 j += i;
5785 while (i--)
5786 *q++ = ' ';
5787 }
5788 }
5789 else {
5790 j++;
5791 *q++ = *p;
5792 if (*p == '\n' || *p == '\r')
5793 j = 0;
5794 }
5795
5796 return (PyObject*) u;
5797}
5798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005799PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800"S.find(sub [,start [,end]]) -> int\n\
5801\n\
5802Return the lowest index in S where substring sub is found,\n\
5803such that sub is contained within s[start,end]. Optional\n\
5804arguments start and end are interpreted as in slice notation.\n\
5805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005806Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808static PyObject *
5809unicode_find(PyUnicodeObject *self, PyObject *args)
5810{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005811 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005813 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005814 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
Guido van Rossumb8872e62000-05-09 14:14:27 +00005816 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 substring = PyUnicode_FromObject(substring);
5820 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 return NULL;
5822
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 result = stringlib_find_slice(
5824 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5825 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5826 start, end
5827 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830
5831 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832}
5833
5834static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836{
5837 if (index < 0 || index >= self->length) {
5838 PyErr_SetString(PyExc_IndexError, "string index out of range");
5839 return NULL;
5840 }
5841
5842 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5843}
5844
5845static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005846unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005848 /* Since Unicode objects compare equal to their UTF-8 string
5849 counterparts, we hash the UTF-8 string. */
5850 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5851 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852}
5853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005854PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855"S.index(sub [,start [,end]]) -> int\n\
5856\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005857Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
5859static PyObject *
5860unicode_index(PyUnicodeObject *self, PyObject *args)
5861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005863 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005865 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Guido van Rossumb8872e62000-05-09 14:14:27 +00005867 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5868 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 substring = PyUnicode_FromObject(substring);
5871 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 return NULL;
5873
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 result = stringlib_find_slice(
5875 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5876 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5877 start, end
5878 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
5880 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 if (result < 0) {
5883 PyErr_SetString(PyExc_ValueError, "substring not found");
5884 return NULL;
5885 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888}
5889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005890PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005891"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005893Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005894at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
5896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005897unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898{
5899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5900 register const Py_UNICODE *e;
5901 int cased;
5902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 /* Shortcut for single character strings */
5904 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005905 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005907 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005908 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005909 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005910
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 e = p + PyUnicode_GET_SIZE(self);
5912 cased = 0;
5913 for (; p < e; p++) {
5914 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005915
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005917 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 else if (!cased && Py_UNICODE_ISLOWER(ch))
5919 cased = 1;
5920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005924PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005925"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005927Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005928at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
5930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005931unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932{
5933 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5934 register const Py_UNICODE *e;
5935 int cased;
5936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 /* Shortcut for single character strings */
5938 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005939 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005941 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005942 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 e = p + PyUnicode_GET_SIZE(self);
5946 cased = 0;
5947 for (; p < e; p++) {
5948 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005949
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 else if (!cased && Py_UNICODE_ISUPPER(ch))
5953 cased = 1;
5954 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956}
5957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005958PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005959"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005961Return True if S is a titlecased string and there is at least one\n\
5962character in S, i.e. upper- and titlecase characters may only\n\
5963follow uncased characters and lowercase characters only cased ones.\n\
5964Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
5966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005967unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968{
5969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5970 register const Py_UNICODE *e;
5971 int cased, previous_is_cased;
5972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Shortcut for single character strings */
5974 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005975 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5976 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005978 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005979 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 e = p + PyUnicode_GET_SIZE(self);
5983 cased = 0;
5984 previous_is_cased = 0;
5985 for (; p < e; p++) {
5986 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005987
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5989 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005990 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 previous_is_cased = 1;
5992 cased = 1;
5993 }
5994 else if (Py_UNICODE_ISLOWER(ch)) {
5995 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 previous_is_cased = 1;
5998 cased = 1;
5999 }
6000 else
6001 previous_is_cased = 0;
6002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006003 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004}
6005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006006PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006007"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006009Return True if all characters in S are whitespace\n\
6010and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
6012static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006013unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
6015 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6016 register const Py_UNICODE *e;
6017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 /* Shortcut for single character strings */
6019 if (PyUnicode_GET_SIZE(self) == 1 &&
6020 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006023 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006024 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006025 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006026
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 e = p + PyUnicode_GET_SIZE(self);
6028 for (; p < e; p++) {
6029 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006030 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033}
6034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006035PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006036"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006038Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006039and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006040
6041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006042unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006043{
6044 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6045 register const Py_UNICODE *e;
6046
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006047 /* Shortcut for single character strings */
6048 if (PyUnicode_GET_SIZE(self) == 1 &&
6049 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006050 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006051
6052 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006053 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006054 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006055
6056 e = p + PyUnicode_GET_SIZE(self);
6057 for (; p < e; p++) {
6058 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006059 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062}
6063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006064PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006067Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006069
6070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006071unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006072{
6073 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6074 register const Py_UNICODE *e;
6075
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006076 /* Shortcut for single character strings */
6077 if (PyUnicode_GET_SIZE(self) == 1 &&
6078 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006080
6081 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006082 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006084
6085 e = p + PyUnicode_GET_SIZE(self);
6086 for (; p < e; p++) {
6087 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006088 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006089 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091}
6092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006100unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
6102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6103 register const Py_UNICODE *e;
6104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 /* Shortcut for single character strings */
6106 if (PyUnicode_GET_SIZE(self) == 1 &&
6107 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006110 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006111 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006112 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006113
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 e = p + PyUnicode_GET_SIZE(self);
6115 for (; p < e; p++) {
6116 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006117 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120}
6121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006122PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006125Return True if all characters in S are digits\n\
6126and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
6128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006129unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
6131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6132 register const Py_UNICODE *e;
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 /* Shortcut for single character strings */
6135 if (PyUnicode_GET_SIZE(self) == 1 &&
6136 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006139 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006140 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 e = p + PyUnicode_GET_SIZE(self);
6144 for (; p < e; p++) {
6145 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006146 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149}
6150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006158unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
6160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6161 register const Py_UNICODE *e;
6162
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 /* Shortcut for single character strings */
6164 if (PyUnicode_GET_SIZE(self) == 1 &&
6165 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006168 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006169 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006171
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 e = p + PyUnicode_GET_SIZE(self);
6173 for (; p < e; p++) {
6174 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181"S.join(sequence) -> unicode\n\
6182\n\
6183Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006184sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006187unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006189 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190}
6191
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193unicode_length(PyUnicodeObject *self)
6194{
6195 return self->length;
6196}
6197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006199"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200\n\
6201Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006202done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
6204static PyObject *
6205unicode_ljust(PyUnicodeObject *self, PyObject *args)
6206{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006207 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006208 Py_UNICODE fillchar = ' ';
6209
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006210 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 return NULL;
6212
Tim Peters7a29bd52001-09-12 03:03:31 +00006213 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 Py_INCREF(self);
6215 return (PyObject*) self;
6216 }
6217
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006218 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006221PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222"S.lower() -> unicode\n\
6223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
6226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006227unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 return fixup(self, fixlower);
6230}
6231
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6240
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006241/* externally visible for str.strip(unicode) */
6242PyObject *
6243_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6244{
6245 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006247 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6249 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006250
Thomas Wouters477c8d52006-05-27 19:21:47 +00006251 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6252
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006253 i = 0;
6254 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006255 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6256 i++;
6257 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006258 }
6259
6260 j = len;
6261 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 do {
6263 j--;
6264 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6265 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006266 }
6267
6268 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006269 Py_INCREF(self);
6270 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006271 }
6272 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006274}
6275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
6277static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006282
6283 i = 0;
6284 if (striptype != RIGHTSTRIP) {
6285 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6286 i++;
6287 }
6288 }
6289
6290 j = len;
6291 if (striptype != LEFTSTRIP) {
6292 do {
6293 j--;
6294 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6295 j++;
6296 }
6297
6298 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6299 Py_INCREF(self);
6300 return (PyObject*)self;
6301 }
6302 else
6303 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304}
6305
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006306
6307static PyObject *
6308do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6309{
6310 PyObject *sep = NULL;
6311
6312 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6313 return NULL;
6314
6315 if (sep != NULL && sep != Py_None) {
6316 if (PyUnicode_Check(sep))
6317 return _PyUnicode_XStrip(self, striptype, sep);
6318 else if (PyString_Check(sep)) {
6319 PyObject *res;
6320 sep = PyUnicode_FromObject(sep);
6321 if (sep==NULL)
6322 return NULL;
6323 res = _PyUnicode_XStrip(self, striptype, sep);
6324 Py_DECREF(sep);
6325 return res;
6326 }
6327 else {
6328 PyErr_Format(PyExc_TypeError,
6329 "%s arg must be None, unicode or str",
6330 STRIPNAME(striptype));
6331 return NULL;
6332 }
6333 }
6334
6335 return do_strip(self, striptype);
6336}
6337
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006340"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006341\n\
6342Return a copy of the string S with leading and trailing\n\
6343whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006344If chars is given and not None, remove characters in chars instead.\n\
6345If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006346
6347static PyObject *
6348unicode_strip(PyUnicodeObject *self, PyObject *args)
6349{
6350 if (PyTuple_GET_SIZE(args) == 0)
6351 return do_strip(self, BOTHSTRIP); /* Common case */
6352 else
6353 return do_argstrip(self, BOTHSTRIP, args);
6354}
6355
6356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006357PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006358"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006359\n\
6360Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006361If chars is given and not None, remove characters in chars instead.\n\
6362If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006363
6364static PyObject *
6365unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6366{
6367 if (PyTuple_GET_SIZE(args) == 0)
6368 return do_strip(self, LEFTSTRIP); /* Common case */
6369 else
6370 return do_argstrip(self, LEFTSTRIP, args);
6371}
6372
6373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006374PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006375"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006376\n\
6377Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006378If chars is given and not None, remove characters in chars instead.\n\
6379If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006380
6381static PyObject *
6382unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6383{
6384 if (PyTuple_GET_SIZE(args) == 0)
6385 return do_strip(self, RIGHTSTRIP); /* Common case */
6386 else
6387 return do_argstrip(self, RIGHTSTRIP, args);
6388}
6389
6390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 PyUnicodeObject *u;
6395 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006396 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006397 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
6399 if (len < 0)
6400 len = 0;
6401
Tim Peters7a29bd52001-09-12 03:03:31 +00006402 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 /* no repeat, return original string */
6404 Py_INCREF(str);
6405 return (PyObject*) str;
6406 }
Tim Peters8f422462000-09-09 06:13:41 +00006407
6408 /* ensure # of chars needed doesn't overflow int and # of bytes
6409 * needed doesn't overflow size_t
6410 */
6411 nchars = len * str->length;
6412 if (len && nchars / len != str->length) {
6413 PyErr_SetString(PyExc_OverflowError,
6414 "repeated string is too long");
6415 return NULL;
6416 }
6417 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6418 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6419 PyErr_SetString(PyExc_OverflowError,
6420 "repeated string is too long");
6421 return NULL;
6422 }
6423 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 if (!u)
6425 return NULL;
6426
6427 p = u->str;
6428
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429 if (str->length == 1 && len > 0) {
6430 Py_UNICODE_FILL(p, str->str[0], len);
6431 } else {
6432 Py_ssize_t done = 0; /* number of characters copied this far */
6433 if (done < nchars) {
6434 Py_UNICODE_COPY(p, str->str, str->length);
6435 done = str->length;
6436 }
6437 while (done < nchars) {
6438 int n = (done <= nchars-done) ? done : nchars-done;
6439 Py_UNICODE_COPY(p+done, p, n);
6440 done += n;
6441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
6443
6444 return (PyObject*) u;
6445}
6446
6447PyObject *PyUnicode_Replace(PyObject *obj,
6448 PyObject *subobj,
6449 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006450 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
6452 PyObject *self;
6453 PyObject *str1;
6454 PyObject *str2;
6455 PyObject *result;
6456
6457 self = PyUnicode_FromObject(obj);
6458 if (self == NULL)
6459 return NULL;
6460 str1 = PyUnicode_FromObject(subobj);
6461 if (str1 == NULL) {
6462 Py_DECREF(self);
6463 return NULL;
6464 }
6465 str2 = PyUnicode_FromObject(replobj);
6466 if (str2 == NULL) {
6467 Py_DECREF(self);
6468 Py_DECREF(str1);
6469 return NULL;
6470 }
Tim Petersced69f82003-09-16 20:30:58 +00006471 result = replace((PyUnicodeObject *)self,
6472 (PyUnicodeObject *)str1,
6473 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 maxcount);
6475 Py_DECREF(self);
6476 Py_DECREF(str1);
6477 Py_DECREF(str2);
6478 return result;
6479}
6480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482"S.replace (old, new[, maxsplit]) -> unicode\n\
6483\n\
6484Return a copy of S with all occurrences of substring\n\
6485old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
6488static PyObject*
6489unicode_replace(PyUnicodeObject *self, PyObject *args)
6490{
6491 PyUnicodeObject *str1;
6492 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 PyObject *result;
6495
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 return NULL;
6498 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6499 if (str1 == NULL)
6500 return NULL;
6501 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006502 if (str2 == NULL) {
6503 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507 result = replace(self, str1, str2, maxcount);
6508
6509 Py_DECREF(str1);
6510 Py_DECREF(str2);
6511 return result;
6512}
6513
6514static
6515PyObject *unicode_repr(PyObject *unicode)
6516{
6517 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6518 PyUnicode_GET_SIZE(unicode),
6519 1);
6520}
6521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523"S.rfind(sub [,start [,end]]) -> int\n\
6524\n\
6525Return the highest index in S where substring sub is found,\n\
6526such that sub is contained within s[start,end]. Optional\n\
6527arguments start and end are interpreted as in slice notation.\n\
6528\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006529Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
6531static PyObject *
6532unicode_rfind(PyUnicodeObject *self, PyObject *args)
6533{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006534 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006535 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006536 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
Guido van Rossumb8872e62000-05-09 14:14:27 +00006539 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6540 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 substring = PyUnicode_FromObject(substring);
6543 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 return NULL;
6545
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546 result = stringlib_rfind_slice(
6547 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6548 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6549 start, end
6550 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
6552 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553
6554 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555}
6556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558"S.rindex(sub [,start [,end]]) -> int\n\
6559\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006560Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561
6562static PyObject *
6563unicode_rindex(PyUnicodeObject *self, PyObject *args)
6564{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006566 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006567 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006568 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
Guido van Rossumb8872e62000-05-09 14:14:27 +00006570 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6571 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006573 substring = PyUnicode_FromObject(substring);
6574 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 return NULL;
6576
Thomas Wouters477c8d52006-05-27 19:21:47 +00006577 result = stringlib_rfind_slice(
6578 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6579 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6580 start, end
6581 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
6583 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 if (result < 0) {
6586 PyErr_SetString(PyExc_ValueError, "substring not found");
6587 return NULL;
6588 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006589 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590}
6591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006593"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594\n\
6595Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006596done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598static PyObject *
6599unicode_rjust(PyUnicodeObject *self, PyObject *args)
6600{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006601 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006602 Py_UNICODE fillchar = ' ';
6603
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006604 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return NULL;
6606
Tim Peters7a29bd52001-09-12 03:03:31 +00006607 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 Py_INCREF(self);
6609 return (PyObject*) self;
6610 }
6611
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006612 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
6618 /* standard clamping */
6619 if (start < 0)
6620 start = 0;
6621 if (end < 0)
6622 end = 0;
6623 if (end > self->length)
6624 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006625 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 /* full slice, return original string */
6627 Py_INCREF(self);
6628 return (PyObject*) self;
6629 }
6630 if (start > end)
6631 start = end;
6632 /* copy slice */
6633 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6634 end - start);
6635}
6636
6637PyObject *PyUnicode_Split(PyObject *s,
6638 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006639 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640{
6641 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 s = PyUnicode_FromObject(s);
6644 if (s == NULL)
6645 return NULL;
6646 if (sep != NULL) {
6647 sep = PyUnicode_FromObject(sep);
6648 if (sep == NULL) {
6649 Py_DECREF(s);
6650 return NULL;
6651 }
6652 }
6653
6654 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6655
6656 Py_DECREF(s);
6657 Py_XDECREF(sep);
6658 return result;
6659}
6660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006661PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662"S.split([sep [,maxsplit]]) -> list of strings\n\
6663\n\
6664Return a list of the words in S, using sep as the\n\
6665delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006666splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006667any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject*
6670unicode_split(PyUnicodeObject *self, PyObject *args)
6671{
6672 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return NULL;
6677
6678 if (substring == Py_None)
6679 return split(self, NULL, maxcount);
6680 else if (PyUnicode_Check(substring))
6681 return split(self, (PyUnicodeObject *)substring, maxcount);
6682 else
6683 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6684}
6685
Thomas Wouters477c8d52006-05-27 19:21:47 +00006686PyObject *
6687PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6688{
6689 PyObject* str_obj;
6690 PyObject* sep_obj;
6691 PyObject* out;
6692
6693 str_obj = PyUnicode_FromObject(str_in);
6694 if (!str_obj)
6695 return NULL;
6696 sep_obj = PyUnicode_FromObject(sep_in);
6697 if (!sep_obj) {
6698 Py_DECREF(str_obj);
6699 return NULL;
6700 }
6701
6702 out = stringlib_partition(
6703 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6704 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6705 );
6706
6707 Py_DECREF(sep_obj);
6708 Py_DECREF(str_obj);
6709
6710 return out;
6711}
6712
6713
6714PyObject *
6715PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6716{
6717 PyObject* str_obj;
6718 PyObject* sep_obj;
6719 PyObject* out;
6720
6721 str_obj = PyUnicode_FromObject(str_in);
6722 if (!str_obj)
6723 return NULL;
6724 sep_obj = PyUnicode_FromObject(sep_in);
6725 if (!sep_obj) {
6726 Py_DECREF(str_obj);
6727 return NULL;
6728 }
6729
6730 out = stringlib_rpartition(
6731 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6732 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6733 );
6734
6735 Py_DECREF(sep_obj);
6736 Py_DECREF(str_obj);
6737
6738 return out;
6739}
6740
6741PyDoc_STRVAR(partition__doc__,
6742"S.partition(sep) -> (head, sep, tail)\n\
6743\n\
6744Searches for the separator sep in S, and returns the part before it,\n\
6745the separator itself, and the part after it. If the separator is not\n\
6746found, returns S and two empty strings.");
6747
6748static PyObject*
6749unicode_partition(PyUnicodeObject *self, PyObject *separator)
6750{
6751 return PyUnicode_Partition((PyObject *)self, separator);
6752}
6753
6754PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006755"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006756\n\
6757Searches for the separator sep in S, starting at the end of S, and returns\n\
6758the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006759separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006760
6761static PyObject*
6762unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6763{
6764 return PyUnicode_RPartition((PyObject *)self, separator);
6765}
6766
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006767PyObject *PyUnicode_RSplit(PyObject *s,
6768 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006770{
6771 PyObject *result;
6772
6773 s = PyUnicode_FromObject(s);
6774 if (s == NULL)
6775 return NULL;
6776 if (sep != NULL) {
6777 sep = PyUnicode_FromObject(sep);
6778 if (sep == NULL) {
6779 Py_DECREF(s);
6780 return NULL;
6781 }
6782 }
6783
6784 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6785
6786 Py_DECREF(s);
6787 Py_XDECREF(sep);
6788 return result;
6789}
6790
6791PyDoc_STRVAR(rsplit__doc__,
6792"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6793\n\
6794Return a list of the words in S, using sep as the\n\
6795delimiter string, starting at the end of the string and\n\
6796working to the front. If maxsplit is given, at most maxsplit\n\
6797splits are done. If sep is not specified, any whitespace string\n\
6798is a separator.");
6799
6800static PyObject*
6801unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6802{
6803 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006804 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006805
Martin v. Löwis18e16552006-02-15 17:27:45 +00006806 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006807 return NULL;
6808
6809 if (substring == Py_None)
6810 return rsplit(self, NULL, maxcount);
6811 else if (PyUnicode_Check(substring))
6812 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6813 else
6814 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6815}
6816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006818"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819\n\
6820Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006821Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
6824static PyObject*
6825unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6826{
Guido van Rossum86662912000-04-11 15:38:46 +00006827 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
Guido van Rossum86662912000-04-11 15:38:46 +00006829 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 return NULL;
6831
Guido van Rossum86662912000-04-11 15:38:46 +00006832 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
6835static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006836PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006838 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6839 Py_XINCREF(res);
6840 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841}
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844"S.swapcase() -> unicode\n\
6845\n\
6846Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
6849static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006850unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 return fixup(self, fixswapcase);
6853}
6854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856"S.translate(table) -> unicode\n\
6857\n\
6858Return a copy of the string S, where all characters have been mapped\n\
6859through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006860Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6861Unmapped characters are left untouched. Characters mapped to None\n\
6862are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
6864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006865unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
Tim Petersced69f82003-09-16 20:30:58 +00006867 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006869 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 "ignore");
6871}
6872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874"S.upper() -> unicode\n\
6875\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877
6878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006879unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 return fixup(self, fixupper);
6882}
6883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885"S.zfill(width) -> unicode\n\
6886\n\
6887Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889
6890static PyObject *
6891unicode_zfill(PyUnicodeObject *self, PyObject *args)
6892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006893 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 PyUnicodeObject *u;
6895
Martin v. Löwis18e16552006-02-15 17:27:45 +00006896 Py_ssize_t width;
6897 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 return NULL;
6899
6900 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006901 if (PyUnicode_CheckExact(self)) {
6902 Py_INCREF(self);
6903 return (PyObject*) self;
6904 }
6905 else
6906 return PyUnicode_FromUnicode(
6907 PyUnicode_AS_UNICODE(self),
6908 PyUnicode_GET_SIZE(self)
6909 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
6911
6912 fill = width - self->length;
6913
6914 u = pad(self, fill, 0, '0');
6915
Walter Dörwald068325e2002-04-15 13:36:47 +00006916 if (u == NULL)
6917 return NULL;
6918
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 if (u->str[fill] == '+' || u->str[fill] == '-') {
6920 /* move sign to beginning of string */
6921 u->str[0] = u->str[fill];
6922 u->str[fill] = '0';
6923 }
6924
6925 return (PyObject*) u;
6926}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
 * int.
 */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006937"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006939Return True if S starts with the specified prefix, False otherwise.\n\
6940With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941With optional end, stop comparing S at that position.\n\
6942prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject *
6945unicode_startswith(PyUnicodeObject *self,
6946 PyObject *args)
6947{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006950 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006951 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006955 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 if (PyTuple_Check(subobj)) {
6958 Py_ssize_t i;
6959 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6960 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6961 PyTuple_GET_ITEM(subobj, i));
6962 if (substring == NULL)
6963 return NULL;
6964 result = tailmatch(self, substring, start, end, -1);
6965 Py_DECREF(substring);
6966 if (result) {
6967 Py_RETURN_TRUE;
6968 }
6969 }
6970 /* nothing matched */
6971 Py_RETURN_FALSE;
6972 }
6973 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006975 return NULL;
6976 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979}
6980
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006985Return True if S ends with the specified suffix, False otherwise.\n\
6986With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987With optional end, stop comparing S at that position.\n\
6988suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
6990static PyObject *
6991unicode_endswith(PyUnicodeObject *self,
6992 PyObject *args)
6993{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006996 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006997 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006998 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7001 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007003 if (PyTuple_Check(subobj)) {
7004 Py_ssize_t i;
7005 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7006 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7007 PyTuple_GET_ITEM(subobj, i));
7008 if (substring == NULL)
7009 return NULL;
7010 result = tailmatch(self, substring, start, end, +1);
7011 Py_DECREF(substring);
7012 if (result) {
7013 Py_RETURN_TRUE;
7014 }
7015 }
7016 Py_RETURN_FALSE;
7017 }
7018 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025}
7026
7027
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007028
7029static PyObject *
7030unicode_getnewargs(PyUnicodeObject *v)
7031{
7032 return Py_BuildValue("(u#)", v->str, v->length);
7033}
7034
7035
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036static PyMethodDef unicode_methods[] = {
7037
7038 /* Order is according to common usage: often used methods should
7039 appear first, since lookup is done sequentially. */
7040
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007041 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7042 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7043 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007044 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007045 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7046 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7047 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7048 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7049 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7050 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7051 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007052 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7054 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7055 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007056 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007057 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007058/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7059 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7060 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7061 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007063 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007066 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7067 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7068 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7069 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7070 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7071 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7072 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7073 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7074 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7075 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7076 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7077 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7078 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7079 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007080 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007081#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007082 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083#endif
7084
7085#if 0
7086 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007087 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088#endif
7089
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007090 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 {NULL, NULL}
7092};
7093
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007094static PyObject *
7095unicode_mod(PyObject *v, PyObject *w)
7096{
7097 if (!PyUnicode_Check(v)) {
7098 Py_INCREF(Py_NotImplemented);
7099 return Py_NotImplemented;
7100 }
7101 return PyUnicode_Format(v, w);
7102}
7103
7104static PyNumberMethods unicode_as_number = {
7105 0, /*nb_add*/
7106 0, /*nb_subtract*/
7107 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007108 unicode_mod, /*nb_remainder*/
7109};
7110
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007112 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007113 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007114 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7115 (ssizeargfunc) unicode_getitem, /* sq_item */
7116 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 0, /* sq_ass_item */
7118 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007119 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120};
7121
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007122static PyObject*
7123unicode_subscript(PyUnicodeObject* self, PyObject* item)
7124{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007125 if (PyIndex_Check(item)) {
7126 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007127 if (i == -1 && PyErr_Occurred())
7128 return NULL;
7129 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007130 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007131 return unicode_getitem(self, i);
7132 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007133 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007134 Py_UNICODE* source_buf;
7135 Py_UNICODE* result_buf;
7136 PyObject* result;
7137
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007138 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007139 &start, &stop, &step, &slicelength) < 0) {
7140 return NULL;
7141 }
7142
7143 if (slicelength <= 0) {
7144 return PyUnicode_FromUnicode(NULL, 0);
7145 } else {
7146 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007147 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7148 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007149
7150 if (result_buf == NULL)
7151 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007152
7153 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7154 result_buf[i] = source_buf[cur];
7155 }
Tim Petersced69f82003-09-16 20:30:58 +00007156
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007157 result = PyUnicode_FromUnicode(result_buf, slicelength);
7158 PyMem_FREE(result_buf);
7159 return result;
7160 }
7161 } else {
7162 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7163 return NULL;
7164 }
7165}
7166
7167static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007169 (binaryfunc)unicode_subscript, /* mp_subscript */
7170 (objobjargproc)0, /* mp_ass_subscript */
7171};
7172
Martin v. Löwis18e16552006-02-15 17:27:45 +00007173static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007175 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 const void **ptr)
7177{
7178 if (index != 0) {
7179 PyErr_SetString(PyExc_SystemError,
7180 "accessing non-existent unicode segment");
7181 return -1;
7182 }
7183 *ptr = (void *) self->str;
7184 return PyUnicode_GET_DATA_SIZE(self);
7185}
7186
Martin v. Löwis18e16552006-02-15 17:27:45 +00007187static Py_ssize_t
7188unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 const void **ptr)
7190{
7191 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007192 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 return -1;
7194}
7195
7196static int
7197unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
7200 if (lenp)
7201 *lenp = PyUnicode_GET_DATA_SIZE(self);
7202 return 1;
7203}
7204
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007205static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 const void **ptr)
7209{
7210 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007211
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 if (index != 0) {
7213 PyErr_SetString(PyExc_SystemError,
7214 "accessing non-existent unicode segment");
7215 return -1;
7216 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007217 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 if (str == NULL)
7219 return -1;
7220 *ptr = (void *) PyString_AS_STRING(str);
7221 return PyString_GET_SIZE(str);
7222}
7223
7224/* Helpers for PyUnicode_Format() */
7225
7226static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007227getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007229 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 if (argidx < arglen) {
7231 (*p_argidx)++;
7232 if (arglen < 0)
7233 return args;
7234 else
7235 return PyTuple_GetItem(args, argidx);
7236 }
7237 PyErr_SetString(PyExc_TypeError,
7238 "not enough arguments for format string");
7239 return NULL;
7240}
7241
/* Conversion flag bits, set by PyUnicode_Format() while parsing a
   format specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
7247
Martin v. Löwis18e16552006-02-15 17:27:45 +00007248static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007249strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007251 register Py_ssize_t i;
7252 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 for (i = len - 1; i >= 0; i--)
7254 buffer[i] = (Py_UNICODE) charbuffer[i];
7255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 return len;
7257}
7258
Neal Norwitzfc76d632006-01-10 06:03:13 +00007259static int
7260doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7261{
Tim Peters15231542006-02-16 01:08:01 +00007262 Py_ssize_t result;
7263
Neal Norwitzfc76d632006-01-10 06:03:13 +00007264 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007265 result = strtounicode(buffer, (char *)buffer);
7266 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007267}
7268
7269static int
7270longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7271{
Tim Peters15231542006-02-16 01:08:01 +00007272 Py_ssize_t result;
7273
Neal Norwitzfc76d632006-01-10 06:03:13 +00007274 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007275 result = strtounicode(buffer, (char *)buffer);
7276 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007277}
7278
Guido van Rossum078151d2002-08-11 04:24:12 +00007279/* XXX To save some code duplication, formatfloat/long/int could have been
7280 shared with stringobject.c, converting from 8-bit to Unicode after the
7281 formatting is done. */
7282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283static int
7284formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007285 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 int flags,
7287 int prec,
7288 int type,
7289 PyObject *v)
7290{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007291 /* fmt = '%#.' + `prec` + `type`
7292 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 char fmt[20];
7294 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007295
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 x = PyFloat_AsDouble(v);
7297 if (x == -1.0 && PyErr_Occurred())
7298 return -1;
7299 if (prec < 0)
7300 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7302 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007303 /* Worst case length calc to ensure no buffer overrun:
7304
7305 'g' formats:
7306 fmt = %#.<prec>g
7307 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7308 for any double rep.)
7309 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7310
7311 'f' formats:
7312 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7313 len = 1 + 50 + 1 + prec = 52 + prec
7314
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007315 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007316 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007317
7318 */
7319 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7320 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007321 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007322 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007323 return -1;
7324 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007325 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7326 (flags&F_ALT) ? "#" : "",
7327 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007328 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329}
7330
Tim Peters38fd5b62000-09-21 05:43:11 +00007331static PyObject*
7332formatlong(PyObject *val, int flags, int prec, int type)
7333{
7334 char *buf;
7335 int i, len;
7336 PyObject *str; /* temporary string object. */
7337 PyUnicodeObject *result;
7338
7339 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7340 if (!str)
7341 return NULL;
7342 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007343 if (!result) {
7344 Py_DECREF(str);
7345 return NULL;
7346 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007347 for (i = 0; i < len; i++)
7348 result->str[i] = buf[i];
7349 result->str[len] = 0;
7350 Py_DECREF(str);
7351 return (PyObject*)result;
7352}
7353
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354static int
7355formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007356 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 int flags,
7358 int prec,
7359 int type,
7360 PyObject *v)
7361{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007362 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007363 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7364 * + 1 + 1
7365 * = 24
7366 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007367 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007368 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 long x;
7370
7371 x = PyInt_AsLong(v);
7372 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007373 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 if (x < 0 && type == 'u') {
7375 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007376 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007377 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7378 sign = "-";
7379 else
7380 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007382 prec = 1;
7383
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007384 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7385 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007386 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007387 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007388 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007389 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007390 return -1;
7391 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007392
7393 if ((flags & F_ALT) &&
7394 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007395 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007396 * of issues that cause pain:
7397 * - when 0 is being converted, the C standard leaves off
7398 * the '0x' or '0X', which is inconsistent with other
7399 * %#x/%#X conversions and inconsistent with Python's
7400 * hex() function
7401 * - there are platforms that violate the standard and
7402 * convert 0 with the '0x' or '0X'
7403 * (Metrowerks, Compaq Tru64)
7404 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007405 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007406 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007407 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007408 * We can achieve the desired consistency by inserting our
7409 * own '0x' or '0X' prefix, and substituting %x/%X in place
7410 * of %#x/%#X.
7411 *
7412 * Note that this is the same approach as used in
7413 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007414 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007415 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7416 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007417 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007418 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007419 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7420 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007421 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007422 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007423 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007424 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007425 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007426 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427}
7428
7429static int
7430formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007431 size_t buflen,
7432 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007434 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007435 if (PyUnicode_Check(v)) {
7436 if (PyUnicode_GET_SIZE(v) != 1)
7437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007441 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007442 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007443 goto onError;
7444 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447 else {
7448 /* Integer input truncated to a character */
7449 long x;
7450 x = PyInt_AsLong(v);
7451 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007452 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007453#ifdef Py_UNICODE_WIDE
7454 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007455 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007456 "%c arg not in range(0x110000) "
7457 "(wide Python build)");
7458 return -1;
7459 }
7460#else
7461 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007462 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007463 "%c arg not in range(0x10000) "
7464 "(narrow Python build)");
7465 return -1;
7466 }
7467#endif
7468 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 }
7470 buf[1] = '\0';
7471 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007472
7473 onError:
7474 PyErr_SetString(PyExc_TypeError,
7475 "%c requires int or char");
7476 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007479/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7480
7481 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7482 chars are formatted. XXX This is a magic number. Each formatting
7483 routine does bounds checking to ensure no overflow, but a better
7484 solution may be to malloc a buffer of appropriate size for each
7485 format. For now, the current solution is sufficient.
7486*/
7487#define FORMATBUFLEN (size_t)120
7488
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489PyObject *PyUnicode_Format(PyObject *format,
7490 PyObject *args)
7491{
7492 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007493 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 int args_owned = 0;
7495 PyUnicodeObject *result = NULL;
7496 PyObject *dict = NULL;
7497 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007498
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 if (format == NULL || args == NULL) {
7500 PyErr_BadInternalCall();
7501 return NULL;
7502 }
7503 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007504 if (uformat == NULL)
7505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 fmt = PyUnicode_AS_UNICODE(uformat);
7507 fmtcnt = PyUnicode_GET_SIZE(uformat);
7508
7509 reslen = rescnt = fmtcnt + 100;
7510 result = _PyUnicode_New(reslen);
7511 if (result == NULL)
7512 goto onError;
7513 res = PyUnicode_AS_UNICODE(result);
7514
7515 if (PyTuple_Check(args)) {
7516 arglen = PyTuple_Size(args);
7517 argidx = 0;
7518 }
7519 else {
7520 arglen = -1;
7521 argidx = -2;
7522 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007523 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7524 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 dict = args;
7526
7527 while (--fmtcnt >= 0) {
7528 if (*fmt != '%') {
7529 if (--rescnt < 0) {
7530 rescnt = fmtcnt + 100;
7531 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007532 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7535 --rescnt;
7536 }
7537 *res++ = *fmt++;
7538 }
7539 else {
7540 /* Got a format specifier */
7541 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 Py_UNICODE c = '\0';
7545 Py_UNICODE fill;
7546 PyObject *v = NULL;
7547 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007548 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007551 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552
7553 fmt++;
7554 if (*fmt == '(') {
7555 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007556 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 PyObject *key;
7558 int pcount = 1;
7559
7560 if (dict == NULL) {
7561 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007562 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 goto onError;
7564 }
7565 ++fmt;
7566 --fmtcnt;
7567 keystart = fmt;
7568 /* Skip over balanced parentheses */
7569 while (pcount > 0 && --fmtcnt >= 0) {
7570 if (*fmt == ')')
7571 --pcount;
7572 else if (*fmt == '(')
7573 ++pcount;
7574 fmt++;
7575 }
7576 keylen = fmt - keystart - 1;
7577 if (fmtcnt < 0 || pcount > 0) {
7578 PyErr_SetString(PyExc_ValueError,
7579 "incomplete format key");
7580 goto onError;
7581 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007582#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007583 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 then looked up since Python uses strings to hold
7585 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007586 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 key = PyUnicode_EncodeUTF8(keystart,
7588 keylen,
7589 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007590#else
7591 key = PyUnicode_FromUnicode(keystart, keylen);
7592#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 if (key == NULL)
7594 goto onError;
7595 if (args_owned) {
7596 Py_DECREF(args);
7597 args_owned = 0;
7598 }
7599 args = PyObject_GetItem(dict, key);
7600 Py_DECREF(key);
7601 if (args == NULL) {
7602 goto onError;
7603 }
7604 args_owned = 1;
7605 arglen = -1;
7606 argidx = -2;
7607 }
7608 while (--fmtcnt >= 0) {
7609 switch (c = *fmt++) {
7610 case '-': flags |= F_LJUST; continue;
7611 case '+': flags |= F_SIGN; continue;
7612 case ' ': flags |= F_BLANK; continue;
7613 case '#': flags |= F_ALT; continue;
7614 case '0': flags |= F_ZERO; continue;
7615 }
7616 break;
7617 }
7618 if (c == '*') {
7619 v = getnextarg(args, arglen, &argidx);
7620 if (v == NULL)
7621 goto onError;
7622 if (!PyInt_Check(v)) {
7623 PyErr_SetString(PyExc_TypeError,
7624 "* wants int");
7625 goto onError;
7626 }
7627 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007628 if (width == -1 && PyErr_Occurred())
7629 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 if (width < 0) {
7631 flags |= F_LJUST;
7632 width = -width;
7633 }
7634 if (--fmtcnt >= 0)
7635 c = *fmt++;
7636 }
7637 else if (c >= '0' && c <= '9') {
7638 width = c - '0';
7639 while (--fmtcnt >= 0) {
7640 c = *fmt++;
7641 if (c < '0' || c > '9')
7642 break;
7643 if ((width*10) / 10 != width) {
7644 PyErr_SetString(PyExc_ValueError,
7645 "width too big");
7646 goto onError;
7647 }
7648 width = width*10 + (c - '0');
7649 }
7650 }
7651 if (c == '.') {
7652 prec = 0;
7653 if (--fmtcnt >= 0)
7654 c = *fmt++;
7655 if (c == '*') {
7656 v = getnextarg(args, arglen, &argidx);
7657 if (v == NULL)
7658 goto onError;
7659 if (!PyInt_Check(v)) {
7660 PyErr_SetString(PyExc_TypeError,
7661 "* wants int");
7662 goto onError;
7663 }
7664 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007665 if (prec == -1 && PyErr_Occurred())
7666 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 if (prec < 0)
7668 prec = 0;
7669 if (--fmtcnt >= 0)
7670 c = *fmt++;
7671 }
7672 else if (c >= '0' && c <= '9') {
7673 prec = c - '0';
7674 while (--fmtcnt >= 0) {
7675 c = Py_CHARMASK(*fmt++);
7676 if (c < '0' || c > '9')
7677 break;
7678 if ((prec*10) / 10 != prec) {
7679 PyErr_SetString(PyExc_ValueError,
7680 "prec too big");
7681 goto onError;
7682 }
7683 prec = prec*10 + (c - '0');
7684 }
7685 }
7686 } /* prec */
7687 if (fmtcnt >= 0) {
7688 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 if (--fmtcnt >= 0)
7690 c = *fmt++;
7691 }
7692 }
7693 if (fmtcnt < 0) {
7694 PyErr_SetString(PyExc_ValueError,
7695 "incomplete format");
7696 goto onError;
7697 }
7698 if (c != '%') {
7699 v = getnextarg(args, arglen, &argidx);
7700 if (v == NULL)
7701 goto onError;
7702 }
7703 sign = 0;
7704 fill = ' ';
7705 switch (c) {
7706
7707 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007708 pbuf = formatbuf;
7709 /* presume that buffer length is at least 1 */
7710 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 len = 1;
7712 break;
7713
7714 case 's':
7715 case 'r':
7716 if (PyUnicode_Check(v) && c == 's') {
7717 temp = v;
7718 Py_INCREF(temp);
7719 }
7720 else {
7721 PyObject *unicode;
7722 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007723 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 else
7725 temp = PyObject_Repr(v);
7726 if (temp == NULL)
7727 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007728 if (PyUnicode_Check(temp))
7729 /* nothing to do */;
7730 else if (PyString_Check(temp)) {
7731 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007732 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007734 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007736 Py_DECREF(temp);
7737 temp = unicode;
7738 if (temp == NULL)
7739 goto onError;
7740 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007741 else {
7742 Py_DECREF(temp);
7743 PyErr_SetString(PyExc_TypeError,
7744 "%s argument has non-string str()");
7745 goto onError;
7746 }
7747 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007748 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 len = PyUnicode_GET_SIZE(temp);
7750 if (prec >= 0 && len > prec)
7751 len = prec;
7752 break;
7753
7754 case 'i':
7755 case 'd':
7756 case 'u':
7757 case 'o':
7758 case 'x':
7759 case 'X':
7760 if (c == 'i')
7761 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007762 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007763 temp = formatlong(v, flags, prec, c);
7764 if (!temp)
7765 goto onError;
7766 pbuf = PyUnicode_AS_UNICODE(temp);
7767 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007768 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007770 else {
7771 pbuf = formatbuf;
7772 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7773 flags, prec, c, v);
7774 if (len < 0)
7775 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007776 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007777 }
7778 if (flags & F_ZERO)
7779 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 break;
7781
7782 case 'e':
7783 case 'E':
7784 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007785 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 case 'g':
7787 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007788 if (c == 'F')
7789 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007790 pbuf = formatbuf;
7791 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7792 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 if (len < 0)
7794 goto onError;
7795 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007796 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 fill = '0';
7798 break;
7799
7800 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007801 pbuf = formatbuf;
7802 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 if (len < 0)
7804 goto onError;
7805 break;
7806
7807 default:
7808 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007809 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007810 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007811 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007812 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007813 (Py_ssize_t)(fmt - 1 -
7814 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 goto onError;
7816 }
7817 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007818 if (*pbuf == '-' || *pbuf == '+') {
7819 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 len--;
7821 }
7822 else if (flags & F_SIGN)
7823 sign = '+';
7824 else if (flags & F_BLANK)
7825 sign = ' ';
7826 else
7827 sign = 0;
7828 }
7829 if (width < len)
7830 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007831 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 reslen -= rescnt;
7833 rescnt = width + fmtcnt + 100;
7834 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007835 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007836 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007837 PyErr_NoMemory();
7838 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007839 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007840 if (_PyUnicode_Resize(&result, reslen) < 0) {
7841 Py_XDECREF(temp);
7842 goto onError;
7843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 res = PyUnicode_AS_UNICODE(result)
7845 + reslen - rescnt;
7846 }
7847 if (sign) {
7848 if (fill != ' ')
7849 *res++ = sign;
7850 rescnt--;
7851 if (width > len)
7852 width--;
7853 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007854 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7855 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007856 assert(pbuf[1] == c);
7857 if (fill != ' ') {
7858 *res++ = *pbuf++;
7859 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007860 }
Tim Petersfff53252001-04-12 18:38:48 +00007861 rescnt -= 2;
7862 width -= 2;
7863 if (width < 0)
7864 width = 0;
7865 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 if (width > len && !(flags & F_LJUST)) {
7868 do {
7869 --rescnt;
7870 *res++ = fill;
7871 } while (--width > len);
7872 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007873 if (fill == ' ') {
7874 if (sign)
7875 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007876 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007877 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007878 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007879 *res++ = *pbuf++;
7880 *res++ = *pbuf++;
7881 }
7882 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007883 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 res += len;
7885 rescnt -= len;
7886 while (--width >= len) {
7887 --rescnt;
7888 *res++ = ' ';
7889 }
7890 if (dict && (argidx < arglen) && c != '%') {
7891 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007892 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007893 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 goto onError;
7895 }
7896 Py_XDECREF(temp);
7897 } /* '%' */
7898 } /* until end */
7899 if (argidx < arglen && !dict) {
7900 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007901 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 goto onError;
7903 }
7904
Thomas Woutersa96affe2006-03-12 00:29:36 +00007905 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 if (args_owned) {
7908 Py_DECREF(args);
7909 }
7910 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 return (PyObject *)result;
7912
7913 onError:
7914 Py_XDECREF(result);
7915 Py_DECREF(uformat);
7916 if (args_owned) {
7917 Py_DECREF(args);
7918 }
7919 return NULL;
7920}
7921
7922static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007923 (readbufferproc) unicode_buffer_getreadbuf,
7924 (writebufferproc) unicode_buffer_getwritebuf,
7925 (segcountproc) unicode_buffer_getsegcount,
7926 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927};
7928
Jeremy Hylton938ace62002-07-17 16:30:39 +00007929static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007930unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7931
Tim Peters6d6c1a32001-08-02 04:15:00 +00007932static PyObject *
7933unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7934{
7935 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007936 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007937 char *encoding = NULL;
7938 char *errors = NULL;
7939
Guido van Rossume023fe02001-08-30 03:12:59 +00007940 if (type != &PyUnicode_Type)
7941 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007942 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7943 kwlist, &x, &encoding, &errors))
7944 return NULL;
7945 if (x == NULL)
7946 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007947 if (encoding == NULL && errors == NULL)
7948 return PyObject_Unicode(x);
7949 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007950 return PyUnicode_FromEncodedObject(x, encoding, errors);
7951}
7952
Guido van Rossume023fe02001-08-30 03:12:59 +00007953static PyObject *
7954unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7955{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007956 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007958
7959 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7960 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7961 if (tmp == NULL)
7962 return NULL;
7963 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007964 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007965 if (pnew == NULL) {
7966 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007967 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007968 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007969 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7970 if (pnew->str == NULL) {
7971 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007972 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007973 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007974 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007975 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007976 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7977 pnew->length = n;
7978 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007979 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007980 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007981}
7982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007983PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007984"unicode(string [, encoding[, errors]]) -> object\n\
7985\n\
7986Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007987encoding defaults to the current default string encoding.\n\
7988errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007989
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007990static PyObject *unicode_iter(PyObject *seq);
7991
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992PyTypeObject PyUnicode_Type = {
7993 PyObject_HEAD_INIT(&PyType_Type)
7994 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00007995 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 sizeof(PyUnicodeObject), /* tp_size */
7997 0, /* tp_itemsize */
7998 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007999 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008001 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008003 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008004 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008005 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008007 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 (hashfunc) unicode_hash, /* tp_hash*/
8009 0, /* tp_call*/
8010 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008011 PyObject_GenericGetAttr, /* tp_getattro */
8012 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008014 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8015 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008016 unicode_doc, /* tp_doc */
8017 0, /* tp_traverse */
8018 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008019 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008020 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008021 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008022 0, /* tp_iternext */
8023 unicode_methods, /* tp_methods */
8024 0, /* tp_members */
8025 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008026 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008027 0, /* tp_dict */
8028 0, /* tp_descr_get */
8029 0, /* tp_descr_set */
8030 0, /* tp_dictoffset */
8031 0, /* tp_init */
8032 0, /* tp_alloc */
8033 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008034 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035};
8036
8037/* Initialize the Unicode implementation */
8038
Thomas Wouters78890102000-07-22 19:25:51 +00008039void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008041 int i;
8042
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 /* XXX - move this array to unicodectype.c ? */
8044 Py_UNICODE linebreak[] = {
8045 0x000A, /* LINE FEED */
8046 0x000D, /* CARRIAGE RETURN */
8047 0x001C, /* FILE SEPARATOR */
8048 0x001D, /* GROUP SEPARATOR */
8049 0x001E, /* RECORD SEPARATOR */
8050 0x0085, /* NEXT LINE */
8051 0x2028, /* LINE SEPARATOR */
8052 0x2029, /* PARAGRAPH SEPARATOR */
8053 };
8054
Fred Drakee4315f52000-05-09 19:53:39 +00008055 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008056 unicode_freelist = NULL;
8057 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008059 if (!unicode_empty)
8060 return;
8061
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008062 for (i = 0; i < 256; i++)
8063 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008064 if (PyType_Ready(&PyUnicode_Type) < 0)
8065 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008066
8067 /* initialize the linebreak bloom filter */
8068 bloom_linebreak = make_bloom_mask(
8069 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8070 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008071
8072 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
8074
8075/* Finalize the Unicode implementation */
8076
8077void
Thomas Wouters78890102000-07-22 19:25:51 +00008078_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008080 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008081 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008083 Py_XDECREF(unicode_empty);
8084 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008085
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008086 for (i = 0; i < 256; i++) {
8087 if (unicode_latin1[i]) {
8088 Py_DECREF(unicode_latin1[i]);
8089 unicode_latin1[i] = NULL;
8090 }
8091 }
8092
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008093 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 PyUnicodeObject *v = u;
8095 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008096 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008097 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008098 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008099 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008101 unicode_freelist = NULL;
8102 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008104
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008105
8106
8107/********************* Unicode Iterator **************************/
8108
8109typedef struct {
8110 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008111 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008112 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8113} unicodeiterobject;
8114
8115static void
8116unicodeiter_dealloc(unicodeiterobject *it)
8117{
8118 _PyObject_GC_UNTRACK(it);
8119 Py_XDECREF(it->it_seq);
8120 PyObject_GC_Del(it);
8121}
8122
8123static int
8124unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8125{
8126 Py_VISIT(it->it_seq);
8127 return 0;
8128}
8129
8130static PyObject *
8131unicodeiter_next(unicodeiterobject *it)
8132{
8133 PyUnicodeObject *seq;
8134 PyObject *item;
8135
8136 assert(it != NULL);
8137 seq = it->it_seq;
8138 if (seq == NULL)
8139 return NULL;
8140 assert(PyUnicode_Check(seq));
8141
8142 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008143 item = PyUnicode_FromUnicode(
8144 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008145 if (item != NULL)
8146 ++it->it_index;
8147 return item;
8148 }
8149
8150 Py_DECREF(seq);
8151 it->it_seq = NULL;
8152 return NULL;
8153}
8154
8155static PyObject *
8156unicodeiter_len(unicodeiterobject *it)
8157{
8158 Py_ssize_t len = 0;
8159 if (it->it_seq)
8160 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8161 return PyInt_FromSsize_t(len);
8162}
8163
8164PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8165
8166static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008167 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8168 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008169 {NULL, NULL} /* sentinel */
8170};
8171
8172PyTypeObject PyUnicodeIter_Type = {
8173 PyObject_HEAD_INIT(&PyType_Type)
8174 0, /* ob_size */
8175 "unicodeiterator", /* tp_name */
8176 sizeof(unicodeiterobject), /* tp_basicsize */
8177 0, /* tp_itemsize */
8178 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008179 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008180 0, /* tp_print */
8181 0, /* tp_getattr */
8182 0, /* tp_setattr */
8183 0, /* tp_compare */
8184 0, /* tp_repr */
8185 0, /* tp_as_number */
8186 0, /* tp_as_sequence */
8187 0, /* tp_as_mapping */
8188 0, /* tp_hash */
8189 0, /* tp_call */
8190 0, /* tp_str */
8191 PyObject_GenericGetAttr, /* tp_getattro */
8192 0, /* tp_setattro */
8193 0, /* tp_as_buffer */
8194 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8195 0, /* tp_doc */
8196 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8197 0, /* tp_clear */
8198 0, /* tp_richcompare */
8199 0, /* tp_weaklistoffset */
8200 PyObject_SelfIter, /* tp_iter */
8201 (iternextfunc)unicodeiter_next, /* tp_iternext */
8202 unicodeiter_methods, /* tp_methods */
8203 0,
8204};
8205
8206static PyObject *
8207unicode_iter(PyObject *seq)
8208{
8209 unicodeiterobject *it;
8210
8211 if (!PyUnicode_Check(seq)) {
8212 PyErr_BadInternalCall();
8213 return NULL;
8214 }
8215 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8216 if (it == NULL)
8217 return NULL;
8218 it->it_index = 0;
8219 Py_INCREF(seq);
8220 it->it_seq = (PyUnicodeObject *)seq;
8221 _PyObject_GC_TRACK(it);
8222 return (PyObject *)it;
8223}
8224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008225#ifdef __cplusplus
8226}
8227#endif
8228
8229
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008230/*
8231Local variables:
8232c-basic-offset: 4
8233indent-tabs-mode: nil
8234End:
8235*/