blob: af98a90efbd99d1628b241c2a965eaf29638e84e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
130
/* Type of a bloom mask: one bit for each possible value of the low
   5 bits of a character. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK to pre-screen line break tests. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant has type BLOOM_MASK (not int) so that selecting
   bit 31 cannot overflow a signed int, and both arguments are
   parenthesized so arbitrary expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & ((BLOOM_MASK)1 << ((ch) & 0x1F)))

/* Nonzero if ch is a line break character; the cheap bloom test runs
   first and Py_UNICODE_ISLINEBREAK is only consulted when it passes. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Nonzero if chr is a member of set (setlen characters): the bloom mask
   is checked first and the linear scan only runs when it matches.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject *
   variable to the PyObject ** expected by PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
399 Py_ssize_t size = strlen(u);
400
401 /* If the Unicode data is known at construction time, we can apply
402 some optimizations which share commonly used objects. */
403 if (u != NULL) {
404
405 /* Optimization for empty strings */
406 if (size == 0 && unicode_empty != NULL) {
407 Py_INCREF(unicode_empty);
408 return (PyObject *)unicode_empty;
409 }
410
Walter Dörwald071b9da2007-05-05 14:21:20 +0000411 /* Single characters are shared when using this constructor */
412 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000413 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000414 if (!unicode) {
415 unicode = _PyUnicode_New(1);
416 if (!unicode)
417 return NULL;
418 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000419 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000420 }
421 Py_INCREF(unicode);
422 return (PyObject *)unicode;
423 }
424 }
425
426 unicode = _PyUnicode_New(size);
427 if (!unicode)
428 return NULL;
429
430 /* Copy the Unicode data into the new object */
431 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000432 Py_UNICODE *p = unicode->str;
433 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 ;
435 }
436
437 return (PyObject *)unicode;
438}
439
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440#ifdef HAVE_WCHAR_H
441
442PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000443 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 PyUnicodeObject *unicode;
446
447 if (w == NULL) {
448 PyErr_BadInternalCall();
449 return NULL;
450 }
451
452 unicode = _PyUnicode_New(size);
453 if (!unicode)
454 return NULL;
455
456 /* Copy the wchar_t data into the new object */
457#ifdef HAVE_USABLE_WCHAR_T
458 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000459#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 {
461 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000462 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000464 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 *u++ = *w++;
466 }
467#endif
468
469 return (PyObject *)unicode;
470}
471
Martin v. Löwis18e16552006-02-15 17:27:45 +0000472Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
473 wchar_t *w,
474 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475{
476 if (unicode == NULL) {
477 PyErr_BadInternalCall();
478 return -1;
479 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000480
481 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000483 size = PyUnicode_GET_SIZE(unicode) + 1;
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_USABLE_WCHAR_T
486 memcpy(w, unicode->str, size * sizeof(wchar_t));
487#else
488 {
489 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000490 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000492 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 *w++ = *u++;
494 }
495#endif
496
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000497 if (size > PyUnicode_GET_SIZE(unicode))
498 return PyUnicode_GET_SIZE(unicode);
499 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 return size;
501}
502
503#endif
504
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000505PyObject *PyUnicode_FromOrdinal(int ordinal)
506{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000507 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000508
509#ifdef Py_UNICODE_WIDE
510 if (ordinal < 0 || ordinal > 0x10ffff) {
511 PyErr_SetString(PyExc_ValueError,
512 "unichr() arg not in range(0x110000) "
513 "(wide Python build)");
514 return NULL;
515 }
516#else
517 if (ordinal < 0 || ordinal > 0xffff) {
518 PyErr_SetString(PyExc_ValueError,
519 "unichr() arg not in range(0x10000) "
520 "(narrow Python build)");
521 return NULL;
522 }
523#endif
524
Hye-Shik Chang40574832004-04-06 07:24:51 +0000525 s[0] = (Py_UNICODE)ordinal;
526 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000527}
528
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529PyObject *PyUnicode_FromObject(register PyObject *obj)
530{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531 /* XXX Perhaps we should make this API an alias of
532 PyObject_Unicode() instead ?! */
533 if (PyUnicode_CheckExact(obj)) {
534 Py_INCREF(obj);
535 return obj;
536 }
537 if (PyUnicode_Check(obj)) {
538 /* For a Unicode subtype that's not a Unicode object,
539 return a true Unicode object with the same data. */
540 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
541 PyUnicode_GET_SIZE(obj));
542 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000543 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
544}
545
546PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
547 const char *encoding,
548 const char *errors)
549{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000550 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000552 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000553
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 if (obj == NULL) {
555 PyErr_BadInternalCall();
556 return NULL;
557 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000558
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559#if 0
560 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000561 that no encodings is given and then redirect to
562 PyObject_Unicode() which then applies the additional logic for
563 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000564
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000565 NOTE: This API should really only be used for object which
566 represent *encoded* Unicode !
567
568 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000569 if (PyUnicode_Check(obj)) {
570 if (encoding) {
571 PyErr_SetString(PyExc_TypeError,
572 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000573 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000574 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000575 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000576 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577#else
578 if (PyUnicode_Check(obj)) {
579 PyErr_SetString(PyExc_TypeError,
580 "decoding Unicode is not supported");
581 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000582 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000583#endif
584
585 /* Coerce object */
586 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000587 s = PyString_AS_STRING(obj);
588 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000589 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000590 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
591 /* Overwrite the error message with something more useful in
592 case of a TypeError. */
593 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000594 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000595 "coercing to Unicode: need string or buffer, "
596 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000597 obj->ob_type->tp_name);
598 goto onError;
599 }
Tim Petersced69f82003-09-16 20:30:58 +0000600
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000601 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602 if (len == 0) {
603 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000604 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
Tim Petersced69f82003-09-16 20:30:58 +0000606 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000607 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000608
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000609 return v;
610
611 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613}
614
615PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000616 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 const char *encoding,
618 const char *errors)
619{
620 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000621
622 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000623 encoding = PyUnicode_GetDefaultEncoding();
624
625 /* Shortcuts for common default encodings */
626 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000627 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000628 else if (strcmp(encoding, "latin-1") == 0)
629 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000630#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
631 else if (strcmp(encoding, "mbcs") == 0)
632 return PyUnicode_DecodeMBCS(s, size, errors);
633#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000634 else if (strcmp(encoding, "ascii") == 0)
635 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636
637 /* Decode via the codec registry */
638 buffer = PyBuffer_FromMemory((void *)s, size);
639 if (buffer == NULL)
640 goto onError;
641 unicode = PyCodec_Decode(buffer, encoding, errors);
642 if (unicode == NULL)
643 goto onError;
644 if (!PyUnicode_Check(unicode)) {
645 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000646 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 unicode->ob_type->tp_name);
648 Py_DECREF(unicode);
649 goto onError;
650 }
651 Py_DECREF(buffer);
652 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000653
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 onError:
655 Py_XDECREF(buffer);
656 return NULL;
657}
658
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000659PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
660 const char *encoding,
661 const char *errors)
662{
663 PyObject *v;
664
665 if (!PyUnicode_Check(unicode)) {
666 PyErr_BadArgument();
667 goto onError;
668 }
669
670 if (encoding == NULL)
671 encoding = PyUnicode_GetDefaultEncoding();
672
673 /* Decode via the codec registry */
674 v = PyCodec_Decode(unicode, encoding, errors);
675 if (v == NULL)
676 goto onError;
677 return v;
678
679 onError:
680 return NULL;
681}
682
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000685 const char *encoding,
686 const char *errors)
687{
688 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000689
Guido van Rossumd57fd912000-03-10 22:53:23 +0000690 unicode = PyUnicode_FromUnicode(s, size);
691 if (unicode == NULL)
692 return NULL;
693 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
694 Py_DECREF(unicode);
695 return v;
696}
697
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000698PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
699 const char *encoding,
700 const char *errors)
701{
702 PyObject *v;
703
704 if (!PyUnicode_Check(unicode)) {
705 PyErr_BadArgument();
706 goto onError;
707 }
708
709 if (encoding == NULL)
710 encoding = PyUnicode_GetDefaultEncoding();
711
712 /* Encode via the codec registry */
713 v = PyCodec_Encode(unicode, encoding, errors);
714 if (v == NULL)
715 goto onError;
716 return v;
717
718 onError:
719 return NULL;
720}
721
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
723 const char *encoding,
724 const char *errors)
725{
726 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 if (!PyUnicode_Check(unicode)) {
729 PyErr_BadArgument();
730 goto onError;
731 }
Fred Drakee4315f52000-05-09 19:53:39 +0000732
Tim Petersced69f82003-09-16 20:30:58 +0000733 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000734 encoding = PyUnicode_GetDefaultEncoding();
735
736 /* Shortcuts for common default encodings */
737 if (errors == NULL) {
738 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000739 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000740 else if (strcmp(encoding, "latin-1") == 0)
741 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000742#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
743 else if (strcmp(encoding, "mbcs") == 0)
744 return PyUnicode_AsMBCSString(unicode);
745#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000746 else if (strcmp(encoding, "ascii") == 0)
747 return PyUnicode_AsASCIIString(unicode);
748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749
750 /* Encode via the codec registry */
751 v = PyCodec_Encode(unicode, encoding, errors);
752 if (v == NULL)
753 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000754 if (!PyBytes_Check(v)) {
755 if (PyString_Check(v)) {
756 /* Old codec, turn it into bytes */
757 PyObject *b = PyBytes_FromObject(v);
758 Py_DECREF(v);
759 return b;
760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000762 "encoder did not return a bytes object "
763 "(type=%.400s, encoding=%.20s, errors=%.20s)",
764 v->ob_type->tp_name,
765 encoding ? encoding : "NULL",
766 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 Py_DECREF(v);
768 goto onError;
769 }
770 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000771
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 onError:
773 return NULL;
774}
775
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000776PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
777 const char *errors)
778{
779 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000780 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000781 if (v)
782 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000783 if (errors != NULL)
784 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
785 if (errors == NULL) {
786 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
787 PyUnicode_GET_SIZE(unicode),
788 NULL);
789 }
790 else {
791 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
792 }
793 if (!b)
794 return NULL;
795 v = PyString_FromStringAndSize(PyBytes_AsString(b),
796 PyBytes_Size(b));
797 Py_DECREF(b);
798 if (!errors) {
799 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000800 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000801 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000802 return v;
803}
804
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
806{
807 if (!PyUnicode_Check(unicode)) {
808 PyErr_BadArgument();
809 goto onError;
810 }
811 return PyUnicode_AS_UNICODE(unicode);
812
813 onError:
814 return NULL;
815}
816
Martin v. Löwis18e16552006-02-15 17:27:45 +0000817Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 if (!PyUnicode_Check(unicode)) {
820 PyErr_BadArgument();
821 goto onError;
822 }
823 return PyUnicode_GET_SIZE(unicode);
824
825 onError:
826 return -1;
827}
828
Thomas Wouters78890102000-07-22 19:25:51 +0000829const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000830{
831 return unicode_default_encoding;
832}
833
834int PyUnicode_SetDefaultEncoding(const char *encoding)
835{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000836 if (strcmp(encoding, unicode_default_encoding) != 0) {
837 PyErr_Format(PyExc_ValueError,
838 "Can only set default encoding to %s",
839 unicode_default_encoding);
840 return -1;
841 }
Fred Drakee4315f52000-05-09 19:53:39 +0000842 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000843}
844
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000845/* error handling callback helper:
846 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000847 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848 and adjust various state variables.
849 return 0 on success, -1 on error
850*/
851
852static
853int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
854 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000855 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
856 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000859
860 PyObject *restuple = NULL;
861 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
863 Py_ssize_t requiredsize;
864 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000867 int res = -1;
868
869 if (*errorHandler == NULL) {
870 *errorHandler = PyCodec_LookupError(errors);
871 if (*errorHandler == NULL)
872 goto onError;
873 }
874
875 if (*exceptionObject == NULL) {
876 *exceptionObject = PyUnicodeDecodeError_Create(
877 encoding, input, insize, *startinpos, *endinpos, reason);
878 if (*exceptionObject == NULL)
879 goto onError;
880 }
881 else {
882 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
883 goto onError;
884 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
885 goto onError;
886 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
887 goto onError;
888 }
889
890 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
891 if (restuple == NULL)
892 goto onError;
893 if (!PyTuple_Check(restuple)) {
894 PyErr_Format(PyExc_TypeError, &argparse[4]);
895 goto onError;
896 }
897 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
898 goto onError;
899 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000900 newpos = insize+newpos;
901 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000902 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000903 goto onError;
904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000905
906 /* need more space? (at least enough for what we
907 have+the replacement+the rest of the string (starting
908 at the new input position), so we won't have to check space
909 when there are no errors in the rest of the string) */
910 repptr = PyUnicode_AS_UNICODE(repunicode);
911 repsize = PyUnicode_GET_SIZE(repunicode);
912 requiredsize = *outpos + repsize + insize-newpos;
913 if (requiredsize > outsize) {
914 if (requiredsize<2*outsize)
915 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000916 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000917 goto onError;
918 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
919 }
920 *endinpos = newpos;
921 *inptr = input + newpos;
922 Py_UNICODE_COPY(*outptr, repptr, repsize);
923 *outptr += repsize;
924 *outpos += repsize;
925 /* we made it! */
926 res = 0;
927
928 onError:
929 Py_XDECREF(restuple);
930 return res;
931}
932
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional)

       NOTE(review): '/' (0x2F) is marked 1 (special) here, while RFC 2152
       appears to list it among the directly encodable characters -- confirm
       whether this is intentional before changing the table. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: anything outside 1..127, anything the
   table marks 1, and -- when the corresponding flag is set -- whitespace
   (2) or Set O (3) characters.  The range tests come first so the table is
   never indexed out of bounds. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Written with explicit ASCII
   ranges rather than isalnum(): c may be a Py_UNICODE value, for which
   isalnum() is undefined outside the unsigned char range and whose answer
   depends on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as ch holds at least 6 pending
   bits; bits is decremented by 6 per digit written. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)

/* Emit 16-bit units to out for as long as ch holds at least 16 pending
   bits.  Relies on the expansion site providing an `errmsg` variable and a
   `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high */   \
                /* surrogate so let's not bother seeing if the low */   \
                /* surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't */  \
                /* represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000999PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 const char *errors)
1002{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001003 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t startinpos;
1005 Py_ssize_t endinpos;
1006 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 const char *e;
1008 PyUnicodeObject *unicode;
1009 Py_UNICODE *p;
1010 const char *errmsg = "";
1011 int inShift = 0;
1012 unsigned int bitsleft = 0;
1013 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 int surrogate = 0;
1015 PyObject *errorHandler = NULL;
1016 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001017
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1023
1024 p = unicode->str;
1025 e = s + size;
1026
1027 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028 Py_UNICODE ch;
1029 restart:
1030 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001031
1032 if (inShift) {
1033 if ((ch == '-') || !B64CHAR(ch)) {
1034 inShift = 0;
1035 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001036
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1038 if (bitsleft >= 6) {
1039 /* The shift sequence has a partial character in it. If
1040 bitsleft < 6 then we could just classify it as padding
1041 but that is not the case here */
1042
1043 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001044 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001045 }
1046 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001047 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 here so indicate the potential of a misencoded character. */
1049
1050 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1051 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1052 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001053 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001054 }
1055
1056 if (ch == '-') {
1057 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001058 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001059 inShift = 1;
1060 }
1061 } else if (SPECIAL(ch,0,0)) {
1062 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001063 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 } else {
1065 *p++ = ch;
1066 }
1067 } else {
1068 charsleft = (charsleft << 6) | UB64(ch);
1069 bitsleft += 6;
1070 s++;
1071 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1072 }
1073 }
1074 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 s++;
1077 if (s < e && *s == '-') {
1078 s++;
1079 *p++ = '+';
1080 } else
1081 {
1082 inShift = 1;
1083 bitsleft = 0;
1084 }
1085 }
1086 else if (SPECIAL(ch,0,0)) {
1087 errmsg = "unexpected special character";
1088 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001089 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 }
1091 else {
1092 *p++ = ch;
1093 s++;
1094 }
1095 continue;
1096 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001097 outpos = p-PyUnicode_AS_UNICODE(unicode);
1098 endinpos = s-starts;
1099 if (unicode_decode_call_errorhandler(
1100 errors, &errorHandler,
1101 "utf7", errmsg,
1102 starts, size, &startinpos, &endinpos, &exc, &s,
1103 (PyObject **)&unicode, &outpos, &p))
1104 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001105 }
1106
1107 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001108 outpos = p-PyUnicode_AS_UNICODE(unicode);
1109 endinpos = size;
1110 if (unicode_decode_call_errorhandler(
1111 errors, &errorHandler,
1112 "utf7", "unterminated shift sequence",
1113 starts, size, &startinpos, &endinpos, &exc, &s,
1114 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 if (s < e)
1117 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118 }
1119
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001120 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 goto onError;
1122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001123 Py_XDECREF(errorHandler);
1124 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 return (PyObject *)unicode;
1126
1127onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001128 Py_XDECREF(errorHandler);
1129 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001130 Py_DECREF(unicode);
1131 return NULL;
1132}
1133
1134
1135PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001136 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 int encodeSetO,
1138 int encodeWhiteSpace,
1139 const char *errors)
1140{
1141 PyObject *v;
1142 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001143 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001144 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001145 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 unsigned int bitsleft = 0;
1147 unsigned long charsleft = 0;
1148 char * out;
1149 char * start;
1150
1151 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001152 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001153
Walter Dörwald51ab4142007-05-05 14:43:36 +00001154 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001155 if (v == NULL)
1156 return NULL;
1157
Walter Dörwald51ab4142007-05-05 14:43:36 +00001158 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 for (;i < size; ++i) {
1160 Py_UNICODE ch = s[i];
1161
1162 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001163 if (ch == '+') {
1164 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 *out++ = '-';
1166 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1167 charsleft = ch;
1168 bitsleft = 16;
1169 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001170 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001171 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001172 } else {
1173 *out++ = (char) ch;
1174 }
1175 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001176 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1177 *out++ = B64(charsleft << (6-bitsleft));
1178 charsleft = 0;
1179 bitsleft = 0;
1180 /* Characters not in the BASE64 set implicitly unshift the sequence
1181 so no '-' is required, except if the character is itself a '-' */
1182 if (B64CHAR(ch) || ch == '-') {
1183 *out++ = '-';
1184 }
1185 inShift = 0;
1186 *out++ = (char) ch;
1187 } else {
1188 bitsleft += 16;
1189 charsleft = (charsleft << 16) | ch;
1190 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1191
1192 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001193 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001194 or '-' then the shift sequence will be terminated implicitly and we
1195 don't have to insert a '-'. */
1196
1197 if (bitsleft == 0) {
1198 if (i + 1 < size) {
1199 Py_UNICODE ch2 = s[i+1];
1200
1201 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001202
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001203 } else if (B64CHAR(ch2) || ch2 == '-') {
1204 *out++ = '-';
1205 inShift = 0;
1206 } else {
1207 inShift = 0;
1208 }
1209
1210 }
1211 else {
1212 *out++ = '-';
1213 inShift = 0;
1214 }
1215 }
Tim Petersced69f82003-09-16 20:30:58 +00001216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001217 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001218 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001219 if (bitsleft) {
1220 *out++= B64(charsleft << (6-bitsleft) );
1221 *out++ = '-';
1222 }
1223
Walter Dörwald51ab4142007-05-05 14:43:36 +00001224 if (PyBytes_Resize(v, out - start)) {
1225 Py_DECREF(v);
1226 return NULL;
1227 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001228 return v;
1229}
1230
1231#undef SPECIAL
1232#undef B64
1233#undef B64CHAR
1234#undef UB64
1235#undef ENCODE
1236#undef DECODE
1237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238/* --- UTF-8 Codec -------------------------------------------------------- */
1239
static char utf8_code_length[256] = {
    /* Sequence length implied by each possible UTF-8 prefix (first) byte;
       zero means the byte is an illegal prefix.  See RFC 2279 for details. */

    /* 0x00-0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0-0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0-0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1261
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001263 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 const char *errors)
1265{
Walter Dörwald69652032004-09-07 20:24:22 +00001266 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1267}
1268
1269PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001270 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001271 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 Py_ssize_t startinpos;
1277 Py_ssize_t endinpos;
1278 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 const char *e;
1280 PyUnicodeObject *unicode;
1281 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 PyObject *errorHandler = NULL;
1284 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285
1286 /* Note: size will always be longer than the resulting Unicode
1287 character count */
1288 unicode = _PyUnicode_New(size);
1289 if (!unicode)
1290 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001291 if (size == 0) {
1292 if (consumed)
1293 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296
1297 /* Unpack UTF-8 encoded data */
1298 p = unicode->str;
1299 e = s + size;
1300
1301 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303
1304 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001305 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 s++;
1307 continue;
1308 }
1309
1310 n = utf8_code_length[ch];
1311
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001313 if (consumed)
1314 break;
1315 else {
1316 errmsg = "unexpected end of data";
1317 startinpos = s-starts;
1318 endinpos = size;
1319 goto utf8Error;
1320 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322
1323 switch (n) {
1324
1325 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001326 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327 startinpos = s-starts;
1328 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330
1331 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001332 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 startinpos = s-starts;
1334 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001335 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336
1337 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 if ((s[1] & 0xc0) != 0x80) {
1339 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001340 startinpos = s-starts;
1341 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 goto utf8Error;
1343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001345 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001346 startinpos = s-starts;
1347 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001348 errmsg = "illegal encoding";
1349 goto utf8Error;
1350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001352 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 break;
1354
1355 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001356 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001357 (s[2] & 0xc0) != 0x80) {
1358 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 startinpos = s-starts;
1360 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 goto utf8Error;
1362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001364 if (ch < 0x0800) {
1365 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001366 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001367
1368 XXX For wide builds (UCS-4) we should probably try
1369 to recombine the surrogates into a single code
1370 unit.
1371 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001373 startinpos = s-starts;
1374 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001375 goto utf8Error;
1376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001378 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001379 break;
1380
1381 case 4:
1382 if ((s[1] & 0xc0) != 0x80 ||
1383 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001384 (s[3] & 0xc0) != 0x80) {
1385 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001386 startinpos = s-starts;
1387 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 goto utf8Error;
1389 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001390 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1391 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1392 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001393 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001394 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001395 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001396 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001398 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 startinpos = s-starts;
1400 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001401 goto utf8Error;
1402 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001403#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001404 *p++ = (Py_UNICODE)ch;
1405#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001406 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001407
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001408 /* translate from 10000..10FFFF to 0..FFFF */
1409 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001410
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 /* high surrogate = top 10 bits added to D800 */
1412 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001413
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001414 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001415 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001416#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 break;
1418
1419 default:
1420 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001421 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 startinpos = s-starts;
1423 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001424 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 }
1426 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001427 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001428
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001429 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001430 outpos = p-PyUnicode_AS_UNICODE(unicode);
1431 if (unicode_decode_call_errorhandler(
1432 errors, &errorHandler,
1433 "utf8", errmsg,
1434 starts, size, &startinpos, &endinpos, &exc, &s,
1435 (PyObject **)&unicode, &outpos, &p))
1436 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 }
Walter Dörwald69652032004-09-07 20:24:22 +00001438 if (consumed)
1439 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440
1441 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001442 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 goto onError;
1444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 Py_XDECREF(errorHandler);
1446 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 return (PyObject *)unicode;
1448
1449onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001450 Py_XDECREF(errorHandler);
1451 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 Py_DECREF(unicode);
1453 return NULL;
1454}
1455
Tim Peters602f7402002-04-27 18:03:26 +00001456/* Allocation strategy: if the string is short, convert into a stack buffer
1457 and allocate exactly as much space needed at the end. Else allocate the
1458 maximum possible needed (4 result bytes per Unicode character), and return
1459 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001460*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001461PyObject *
1462PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001463 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001464 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465{
Tim Peters602f7402002-04-27 18:03:26 +00001466#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001467
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001469 PyObject *v; /* result string object */
1470 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001472 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001473 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001474
Tim Peters602f7402002-04-27 18:03:26 +00001475 assert(s != NULL);
1476 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477
Tim Peters602f7402002-04-27 18:03:26 +00001478 if (size <= MAX_SHORT_UNICHARS) {
1479 /* Write into the stack buffer; nallocated can't overflow.
1480 * At the end, we'll allocate exactly as much heap space as it
1481 * turns out we need.
1482 */
1483 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1484 v = NULL; /* will allocate after we're done */
1485 p = stackbuf;
1486 }
1487 else {
1488 /* Overallocate on the heap, and give the excess back at the end. */
1489 nallocated = size * 4;
1490 if (nallocated / 4 != size) /* overflow! */
1491 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001492 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001493 if (v == NULL)
1494 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001496 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001499 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001500
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001501 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001502 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001504
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001506 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001507 *p++ = (char)(0xc0 | (ch >> 6));
1508 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001509 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001510 else {
Tim Peters602f7402002-04-27 18:03:26 +00001511 /* Encode UCS2 Unicode ordinals */
1512 if (ch < 0x10000) {
1513 /* Special case: check for high surrogate */
1514 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1515 Py_UCS4 ch2 = s[i];
1516 /* Check for low surrogate and combine the two to
1517 form a UCS4 value */
1518 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001519 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001520 i++;
1521 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001522 }
Tim Peters602f7402002-04-27 18:03:26 +00001523 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001524 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001525 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001526 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1527 *p++ = (char)(0x80 | (ch & 0x3f));
1528 continue;
1529 }
1530encodeUCS4:
1531 /* Encode UCS4 Unicode ordinals */
1532 *p++ = (char)(0xf0 | (ch >> 18));
1533 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1534 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1535 *p++ = (char)(0x80 | (ch & 0x3f));
1536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001538
Tim Peters602f7402002-04-27 18:03:26 +00001539 if (v == NULL) {
1540 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001541 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001542 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001543 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001544 }
1545 else {
1546 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001547 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001548 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001549 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001552
Tim Peters602f7402002-04-27 18:03:26 +00001553#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554}
1555
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1557{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 if (!PyUnicode_Check(unicode)) {
1559 PyErr_BadArgument();
1560 return NULL;
1561 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001562 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1563 PyUnicode_GET_SIZE(unicode),
1564 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565}
1566
1567/* --- UTF-16 Codec ------------------------------------------------------- */
1568
Tim Peters772747b2001-08-09 22:21:55 +00001569PyObject *
1570PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001571 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001572 const char *errors,
1573 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574{
Walter Dörwald69652032004-09-07 20:24:22 +00001575 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1576}
1577
1578PyObject *
1579PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001581 const char *errors,
1582 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001583 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001584{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001585 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001586 Py_ssize_t startinpos;
1587 Py_ssize_t endinpos;
1588 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 PyUnicodeObject *unicode;
1590 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001591 const unsigned char *q, *e;
1592 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001593 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001594 /* Offsets from q for retrieving byte pairs in the right order. */
1595#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1596 int ihi = 1, ilo = 0;
1597#else
1598 int ihi = 0, ilo = 1;
1599#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 PyObject *errorHandler = NULL;
1601 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602
1603 /* Note: size will always be longer than the resulting Unicode
1604 character count */
1605 unicode = _PyUnicode_New(size);
1606 if (!unicode)
1607 return NULL;
1608 if (size == 0)
1609 return (PyObject *)unicode;
1610
1611 /* Unpack UTF-16 encoded data */
1612 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001613 q = (unsigned char *)s;
1614 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615
1616 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001617 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001619 /* Check for BOM marks (U+FEFF) in the input and adjust current
1620 byte order setting accordingly. In native mode, the leading BOM
1621 mark is skipped, in all other modes, it is copied to the output
1622 stream as-is (giving a ZWNBSP character). */
1623 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001624 if (size >= 2) {
1625 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001626#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001627 if (bom == 0xFEFF) {
1628 q += 2;
1629 bo = -1;
1630 }
1631 else if (bom == 0xFFFE) {
1632 q += 2;
1633 bo = 1;
1634 }
Tim Petersced69f82003-09-16 20:30:58 +00001635#else
Walter Dörwald69652032004-09-07 20:24:22 +00001636 if (bom == 0xFEFF) {
1637 q += 2;
1638 bo = 1;
1639 }
1640 else if (bom == 0xFFFE) {
1641 q += 2;
1642 bo = -1;
1643 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001644#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001645 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647
Tim Peters772747b2001-08-09 22:21:55 +00001648 if (bo == -1) {
1649 /* force LE */
1650 ihi = 1;
1651 ilo = 0;
1652 }
1653 else if (bo == 1) {
1654 /* force BE */
1655 ihi = 0;
1656 ilo = 1;
1657 }
1658
1659 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001661 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001663 if (consumed)
1664 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 errmsg = "truncated data";
1666 startinpos = ((const char *)q)-starts;
1667 endinpos = ((const char *)e)-starts;
1668 goto utf16Error;
1669 /* The remaining input chars are ignored if the callback
1670 chooses to skip the input */
1671 }
1672 ch = (q[ihi] << 8) | q[ilo];
1673
Tim Peters772747b2001-08-09 22:21:55 +00001674 q += 2;
1675
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 if (ch < 0xD800 || ch > 0xDFFF) {
1677 *p++ = ch;
1678 continue;
1679 }
1680
1681 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001682 if (q >= e) {
1683 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 startinpos = (((const char *)q)-2)-starts;
1685 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001686 goto utf16Error;
1687 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001688 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001689 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1690 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001691 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001692#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001693 *p++ = ch;
1694 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001695#else
1696 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001698 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699 }
1700 else {
1701 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 startinpos = (((const char *)q)-4)-starts;
1703 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001704 goto utf16Error;
1705 }
1706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001708 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 startinpos = (((const char *)q)-2)-starts;
1710 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001711 /* Fall through to report the error */
1712
1713 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 outpos = p-PyUnicode_AS_UNICODE(unicode);
1715 if (unicode_decode_call_errorhandler(
1716 errors, &errorHandler,
1717 "utf16", errmsg,
1718 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1719 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001720 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 }
1722
1723 if (byteorder)
1724 *byteorder = bo;
1725
Walter Dörwald69652032004-09-07 20:24:22 +00001726 if (consumed)
1727 *consumed = (const char *)q-starts;
1728
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001730 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 goto onError;
1732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 Py_XDECREF(errorHandler);
1734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 return (PyObject *)unicode;
1736
1737onError:
1738 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 Py_XDECREF(errorHandler);
1740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 return NULL;
1742}
1743
Tim Peters772747b2001-08-09 22:21:55 +00001744PyObject *
1745PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001746 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001747 const char *errors,
1748 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749{
1750 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001751 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001752#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001753 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001754#else
1755 const int pairs = 0;
1756#endif
Tim Peters772747b2001-08-09 22:21:55 +00001757 /* Offsets from p for storing byte pairs in the right order. */
1758#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1759 int ihi = 1, ilo = 0;
1760#else
1761 int ihi = 0, ilo = 1;
1762#endif
1763
1764#define STORECHAR(CH) \
1765 do { \
1766 p[ihi] = ((CH) >> 8) & 0xff; \
1767 p[ilo] = (CH) & 0xff; \
1768 p += 2; \
1769 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001771#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001772 for (i = pairs = 0; i < size; i++)
1773 if (s[i] >= 0x10000)
1774 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001775#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001776 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001777 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 if (v == NULL)
1779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
Walter Dörwald3cc34522007-05-04 10:48:27 +00001781 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001783 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001784 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001785 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001786
1787 if (byteorder == -1) {
1788 /* force LE */
1789 ihi = 1;
1790 ilo = 0;
1791 }
1792 else if (byteorder == 1) {
1793 /* force BE */
1794 ihi = 0;
1795 ilo = 1;
1796 }
1797
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001798 while (size-- > 0) {
1799 Py_UNICODE ch = *s++;
1800 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001801#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001802 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001803 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1804 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001806#endif
Tim Peters772747b2001-08-09 22:21:55 +00001807 STORECHAR(ch);
1808 if (ch2)
1809 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001812#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813}
1814
1815PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1816{
1817 if (!PyUnicode_Check(unicode)) {
1818 PyErr_BadArgument();
1819 return NULL;
1820 }
1821 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1822 PyUnicode_GET_SIZE(unicode),
1823 NULL,
1824 0);
1825}
1826
1827/* --- Unicode Escape Codec ----------------------------------------------- */
1828
Fredrik Lundh06d12682001-01-24 07:59:11 +00001829static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001830
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001832 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 const char *errors)
1834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001836 Py_ssize_t startinpos;
1837 Py_ssize_t endinpos;
1838 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 char* message;
1844 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 PyObject *errorHandler = NULL;
1846 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 /* Escaped strings will always be longer than the resulting
1849 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 length after conversion to the true value.
1851 (but if the error callback returns a long replacement string
1852 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 v = _PyUnicode_New(size);
1854 if (v == NULL)
1855 goto onError;
1856 if (size == 0)
1857 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 while (s < end) {
1863 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001864 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866
1867 /* Non-escape characters are interpreted as Unicode ordinals */
1868 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001869 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 continue;
1871 }
1872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 /* \ - Escapes */
1875 s++;
1876 switch (*s++) {
1877
1878 /* \x escapes */
1879 case '\n': break;
1880 case '\\': *p++ = '\\'; break;
1881 case '\'': *p++ = '\''; break;
1882 case '\"': *p++ = '\"'; break;
1883 case 'b': *p++ = '\b'; break;
1884 case 'f': *p++ = '\014'; break; /* FF */
1885 case 't': *p++ = '\t'; break;
1886 case 'n': *p++ = '\n'; break;
1887 case 'r': *p++ = '\r'; break;
1888 case 'v': *p++ = '\013'; break; /* VT */
1889 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1890
1891 /* \OOO (octal) escapes */
1892 case '0': case '1': case '2': case '3':
1893 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001894 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001896 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001898 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001900 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 break;
1902
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 /* hex escapes */
1904 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001906 digits = 2;
1907 message = "truncated \\xXX escape";
1908 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 digits = 4;
1913 message = "truncated \\uXXXX escape";
1914 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001917 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001918 digits = 8;
1919 message = "truncated \\UXXXXXXXX escape";
1920 hexescape:
1921 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 outpos = p-PyUnicode_AS_UNICODE(v);
1923 if (s+digits>end) {
1924 endinpos = size;
1925 if (unicode_decode_call_errorhandler(
1926 errors, &errorHandler,
1927 "unicodeescape", "end of string in escape sequence",
1928 starts, size, &startinpos, &endinpos, &exc, &s,
1929 (PyObject **)&v, &outpos, &p))
1930 goto onError;
1931 goto nextByte;
1932 }
1933 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001934 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001935 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 endinpos = (s+i+1)-starts;
1937 if (unicode_decode_call_errorhandler(
1938 errors, &errorHandler,
1939 "unicodeescape", message,
1940 starts, size, &startinpos, &endinpos, &exc, &s,
1941 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001942 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001943 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001944 }
1945 chr = (chr<<4) & ~0xF;
1946 if (c >= '0' && c <= '9')
1947 chr += c - '0';
1948 else if (c >= 'a' && c <= 'f')
1949 chr += 10 + c - 'a';
1950 else
1951 chr += 10 + c - 'A';
1952 }
1953 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001954 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 /* _decoding_error will have already written into the
1956 target buffer. */
1957 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001959 /* when we get here, chr is a 32-bit unicode character */
1960 if (chr <= 0xffff)
1961 /* UCS-2 character */
1962 *p++ = (Py_UNICODE) chr;
1963 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001965 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001966#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001967 *p++ = chr;
1968#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001969 chr -= 0x10000L;
1970 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001971 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001972#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001973 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001974 endinpos = s-starts;
1975 outpos = p-PyUnicode_AS_UNICODE(v);
1976 if (unicode_decode_call_errorhandler(
1977 errors, &errorHandler,
1978 "unicodeescape", "illegal Unicode character",
1979 starts, size, &startinpos, &endinpos, &exc, &s,
1980 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001981 goto onError;
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
1984
1985 /* \N{name} */
1986 case 'N':
1987 message = "malformed \\N character escape";
1988 if (ucnhash_CAPI == NULL) {
1989 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001990 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001991 m = PyImport_ImportModule("unicodedata");
1992 if (m == NULL)
1993 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001994 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001995 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001996 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001997 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001998 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002000 if (ucnhash_CAPI == NULL)
2001 goto ucnhashError;
2002 }
2003 if (*s == '{') {
2004 const char *start = s+1;
2005 /* look for the closing brace */
2006 while (*s != '}' && s < end)
2007 s++;
2008 if (s > start && s < end && *s == '}') {
2009 /* found a name. look it up in the unicode database */
2010 message = "unknown Unicode character name";
2011 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002012 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002013 goto store;
2014 }
2015 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 endinpos = s-starts;
2017 outpos = p-PyUnicode_AS_UNICODE(v);
2018 if (unicode_decode_call_errorhandler(
2019 errors, &errorHandler,
2020 "unicodeescape", message,
2021 starts, size, &startinpos, &endinpos, &exc, &s,
2022 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002023 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002024 break;
2025
2026 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002027 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 message = "\\ at end of string";
2029 s--;
2030 endinpos = s-starts;
2031 outpos = p-PyUnicode_AS_UNICODE(v);
2032 if (unicode_decode_call_errorhandler(
2033 errors, &errorHandler,
2034 "unicodeescape", message,
2035 starts, size, &startinpos, &endinpos, &exc, &s,
2036 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002037 goto onError;
2038 }
2039 else {
2040 *p++ = '\\';
2041 *p++ = (unsigned char)s[-1];
2042 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002043 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 nextByte:
2046 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002048 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002050 Py_XDECREF(errorHandler);
2051 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002053
Fredrik Lundhccc74732001-02-18 22:13:49 +00002054ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002055 PyErr_SetString(
2056 PyExc_UnicodeError,
2057 "\\N escapes not supported (can't load unicodedata module)"
2058 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002059 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002062 return NULL;
2063
Fredrik Lundhccc74732001-02-18 22:13:49 +00002064onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 Py_XDECREF(errorHandler);
2067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 return NULL;
2069}
2070
/* Return a Unicode-Escape string version of the Unicode object
   (implemented by unicodeescape_string() below).

   If quotes is true, the string is enclosed in single or double quotes
   as appropriate.

*/
2077
Thomas Wouters477c8d52006-05-27 19:21:47 +00002078Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2079 Py_ssize_t size,
2080 Py_UNICODE ch)
2081{
2082 /* like wcschr, but doesn't stop at NULL characters */
2083
2084 while (size-- > 0) {
2085 if (*s == ch)
2086 return s;
2087 s++;
2088 }
2089
2090 return NULL;
2091}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002092
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093static
2094PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002095 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 int quotes)
2097{
2098 PyObject *repr;
2099 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002101 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102
Thomas Wouters89f507f2006-12-13 04:49:30 +00002103 /* XXX(nnorwitz): rather than over-allocating, it would be
2104 better to choose a different scheme. Perhaps scan the
2105 first N-chars of the string and allocate based on that size.
2106 */
2107 /* Initial allocation is based on the longest-possible unichr
2108 escape.
2109
2110 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2111 unichr, so in this case it's the longest unichr escape. In
2112 narrow (UTF-16) builds this is five chars per source unichr
2113 since there are two unichrs in the surrogate pair, so in narrow
2114 (UTF-16) builds it's not the longest unichr escape.
2115
2116 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2117 so in the narrow (UTF-16) build case it's the longest unichr
2118 escape.
2119 */
2120
2121 repr = PyString_FromStringAndSize(NULL,
2122 2
2123#ifdef Py_UNICODE_WIDE
2124 + 10*size
2125#else
2126 + 6*size
2127#endif
2128 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 if (repr == NULL)
2130 return NULL;
2131
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002132 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133
2134 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002135 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 !findchar(s, size, '"')) ? '"' : '\'';
2137 }
2138 while (size-- > 0) {
2139 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002140
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002141 /* Escape quotes and backslashes */
2142 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002143 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 *p++ = '\\';
2145 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002146 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002147 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002148
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002149#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002150 /* Map 21-bit characters to '\U00xxxxxx' */
2151 else if (ch >= 0x10000) {
2152 *p++ = '\\';
2153 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002154 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2155 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2156 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2157 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2158 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2159 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2160 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002161 *p++ = hexdigit[ch & 0x0000000F];
2162 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002163 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002164#else
2165 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002166 else if (ch >= 0xD800 && ch < 0xDC00) {
2167 Py_UNICODE ch2;
2168 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002169
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002170 ch2 = *s++;
2171 size--;
2172 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2173 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2174 *p++ = '\\';
2175 *p++ = 'U';
2176 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2177 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2178 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2179 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2180 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2181 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2182 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2183 *p++ = hexdigit[ucs & 0x0000000F];
2184 continue;
2185 }
2186 /* Fall through: isolated surrogates are copied as-is */
2187 s--;
2188 size++;
2189 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002190#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002191
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002193 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 *p++ = '\\';
2195 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002196 *p++ = hexdigit[(ch >> 12) & 0x000F];
2197 *p++ = hexdigit[(ch >> 8) & 0x000F];
2198 *p++ = hexdigit[(ch >> 4) & 0x000F];
2199 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002201
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002202 /* Map special whitespace to '\t', \n', '\r' */
2203 else if (ch == '\t') {
2204 *p++ = '\\';
2205 *p++ = 't';
2206 }
2207 else if (ch == '\n') {
2208 *p++ = '\\';
2209 *p++ = 'n';
2210 }
2211 else if (ch == '\r') {
2212 *p++ = '\\';
2213 *p++ = 'r';
2214 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002215
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002216 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002217 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002219 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002220 *p++ = hexdigit[(ch >> 4) & 0x000F];
2221 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002222 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002223
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 /* Copy everything else as-is */
2225 else
2226 *p++ = (char) ch;
2227 }
2228 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002229 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230
2231 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002232 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 return repr;
2234}
2235
2236PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002237 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238{
2239 return unicodeescape_string(s, size, 0);
2240}
2241
2242PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2243{
2244 if (!PyUnicode_Check(unicode)) {
2245 PyErr_BadArgument();
2246 return NULL;
2247 }
2248 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2249 PyUnicode_GET_SIZE(unicode));
2250}
2251
2252/* --- Raw Unicode Escape Codec ------------------------------------------- */
2253
2254PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002255 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 const char *errors)
2257{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t startinpos;
2260 Py_ssize_t endinpos;
2261 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 const char *end;
2265 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 PyObject *errorHandler = NULL;
2267 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002268
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 /* Escaped strings will always be longer than the resulting
2270 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 length after conversion to the true value. (But decoding error
2272 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 v = _PyUnicode_New(size);
2274 if (v == NULL)
2275 goto onError;
2276 if (size == 0)
2277 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 end = s + size;
2280 while (s < end) {
2281 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002282 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002284 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
2286 /* Non-escape characters are interpreted as Unicode ordinals */
2287 if (*s != '\\') {
2288 *p++ = (unsigned char)*s++;
2289 continue;
2290 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* \u-escapes are only interpreted iff the number of leading
2294 backslashes if odd */
2295 bs = s;
2296 for (;s < end;) {
2297 if (*s != '\\')
2298 break;
2299 *p++ = (unsigned char)*s++;
2300 }
2301 if (((s - bs) & 1) == 0 ||
2302 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002303 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 continue;
2305 }
2306 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002307 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 s++;
2309
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002310 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002311 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002312 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002313 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002315 endinpos = s-starts;
2316 if (unicode_decode_call_errorhandler(
2317 errors, &errorHandler,
2318 "rawunicodeescape", "truncated \\uXXXX",
2319 starts, size, &startinpos, &endinpos, &exc, &s,
2320 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002322 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 }
2324 x = (x<<4) & ~0xF;
2325 if (c >= '0' && c <= '9')
2326 x += c - '0';
2327 else if (c >= 'a' && c <= 'f')
2328 x += 10 + c - 'a';
2329 else
2330 x += 10 + c - 'A';
2331 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002332#ifndef Py_UNICODE_WIDE
2333 if (x > 0x10000) {
2334 if (unicode_decode_call_errorhandler(
2335 errors, &errorHandler,
2336 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2337 starts, size, &startinpos, &endinpos, &exc, &s,
2338 (PyObject **)&v, &outpos, &p))
2339 goto onError;
2340 }
2341#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002342 *p++ = x;
2343 nextByte:
2344 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002346 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002347 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002351
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 onError:
2353 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 return NULL;
2357}
2358
2359PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002360 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361{
2362 PyObject *repr;
2363 char *p;
2364 char *q;
2365
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002366 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002368#ifdef Py_UNICODE_WIDE
2369 repr = PyString_FromStringAndSize(NULL, 10 * size);
2370#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002372#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 if (repr == NULL)
2374 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002375 if (size == 0)
2376 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377
2378 p = q = PyString_AS_STRING(repr);
2379 while (size-- > 0) {
2380 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002381#ifdef Py_UNICODE_WIDE
2382 /* Map 32-bit characters to '\Uxxxxxxxx' */
2383 if (ch >= 0x10000) {
2384 *p++ = '\\';
2385 *p++ = 'U';
2386 *p++ = hexdigit[(ch >> 28) & 0xf];
2387 *p++ = hexdigit[(ch >> 24) & 0xf];
2388 *p++ = hexdigit[(ch >> 20) & 0xf];
2389 *p++ = hexdigit[(ch >> 16) & 0xf];
2390 *p++ = hexdigit[(ch >> 12) & 0xf];
2391 *p++ = hexdigit[(ch >> 8) & 0xf];
2392 *p++ = hexdigit[(ch >> 4) & 0xf];
2393 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002394 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002395 else
2396#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 /* Map 16-bit characters to '\uxxxx' */
2398 if (ch >= 256) {
2399 *p++ = '\\';
2400 *p++ = 'u';
2401 *p++ = hexdigit[(ch >> 12) & 0xf];
2402 *p++ = hexdigit[(ch >> 8) & 0xf];
2403 *p++ = hexdigit[(ch >> 4) & 0xf];
2404 *p++ = hexdigit[ch & 15];
2405 }
2406 /* Copy everything else as-is */
2407 else
2408 *p++ = (char) ch;
2409 }
2410 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002411 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return repr;
2413}
2414
2415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2416{
2417 if (!PyUnicode_Check(unicode)) {
2418 PyErr_BadArgument();
2419 return NULL;
2420 }
2421 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2422 PyUnicode_GET_SIZE(unicode));
2423}
2424
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002425/* --- Unicode Internal Codec ------------------------------------------- */
2426
2427PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002428 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002429 const char *errors)
2430{
2431 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002432 Py_ssize_t startinpos;
2433 Py_ssize_t endinpos;
2434 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002435 PyUnicodeObject *v;
2436 Py_UNICODE *p;
2437 const char *end;
2438 const char *reason;
2439 PyObject *errorHandler = NULL;
2440 PyObject *exc = NULL;
2441
Neal Norwitzd43069c2006-01-08 01:12:10 +00002442#ifdef Py_UNICODE_WIDE
2443 Py_UNICODE unimax = PyUnicode_GetMax();
2444#endif
2445
Thomas Wouters89f507f2006-12-13 04:49:30 +00002446 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002447 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2448 if (v == NULL)
2449 goto onError;
2450 if (PyUnicode_GetSize((PyObject *)v) == 0)
2451 return (PyObject *)v;
2452 p = PyUnicode_AS_UNICODE(v);
2453 end = s + size;
2454
2455 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002456 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002457 /* We have to sanity check the raw data, otherwise doom looms for
2458 some malformed UCS-4 data. */
2459 if (
2460 #ifdef Py_UNICODE_WIDE
2461 *p > unimax || *p < 0 ||
2462 #endif
2463 end-s < Py_UNICODE_SIZE
2464 )
2465 {
2466 startinpos = s - starts;
2467 if (end-s < Py_UNICODE_SIZE) {
2468 endinpos = end-starts;
2469 reason = "truncated input";
2470 }
2471 else {
2472 endinpos = s - starts + Py_UNICODE_SIZE;
2473 reason = "illegal code point (> 0x10FFFF)";
2474 }
2475 outpos = p - PyUnicode_AS_UNICODE(v);
2476 if (unicode_decode_call_errorhandler(
2477 errors, &errorHandler,
2478 "unicode_internal", reason,
2479 starts, size, &startinpos, &endinpos, &exc, &s,
2480 (PyObject **)&v, &outpos, &p)) {
2481 goto onError;
2482 }
2483 }
2484 else {
2485 p++;
2486 s += Py_UNICODE_SIZE;
2487 }
2488 }
2489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002491 goto onError;
2492 Py_XDECREF(errorHandler);
2493 Py_XDECREF(exc);
2494 return (PyObject *)v;
2495
2496 onError:
2497 Py_XDECREF(v);
2498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
2500 return NULL;
2501}
2502
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503/* --- Latin-1 Codec ------------------------------------------------------ */
2504
2505PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002506 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 const char *errors)
2508{
2509 PyUnicodeObject *v;
2510 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002511
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002513 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002514 Py_UNICODE r = *(unsigned char*)s;
2515 return PyUnicode_FromUnicode(&r, 1);
2516 }
2517
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 v = _PyUnicode_New(size);
2519 if (v == NULL)
2520 goto onError;
2521 if (size == 0)
2522 return (PyObject *)v;
2523 p = PyUnicode_AS_UNICODE(v);
2524 while (size-- > 0)
2525 *p++ = (unsigned char)*s++;
2526 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 onError:
2529 Py_XDECREF(v);
2530 return NULL;
2531}
2532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533/* create or adjust a UnicodeEncodeError */
2534static void make_encode_exception(PyObject **exceptionObject,
2535 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002536 const Py_UNICODE *unicode, Py_ssize_t size,
2537 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 if (*exceptionObject == NULL) {
2541 *exceptionObject = PyUnicodeEncodeError_Create(
2542 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 }
2544 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002545 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2546 goto onError;
2547 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2548 goto onError;
2549 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2550 goto onError;
2551 return;
2552 onError:
2553 Py_DECREF(*exceptionObject);
2554 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 }
2556}
2557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558/* raises a UnicodeEncodeError */
2559static void raise_encode_exception(PyObject **exceptionObject,
2560 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002561 const Py_UNICODE *unicode, Py_ssize_t size,
2562 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 const char *reason)
2564{
2565 make_encode_exception(exceptionObject,
2566 encoding, unicode, size, startpos, endpos, reason);
2567 if (*exceptionObject != NULL)
2568 PyCodec_StrictErrors(*exceptionObject);
2569}
2570
2571/* error handling callback helper:
2572 build arguments, call the callback and check the arguments,
2573 put the result into newpos and return the replacement string, which
2574 has to be freed by the caller */
2575static PyObject *unicode_encode_call_errorhandler(const char *errors,
2576 PyObject **errorHandler,
2577 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2579 Py_ssize_t startpos, Py_ssize_t endpos,
2580 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002582 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583
2584 PyObject *restuple;
2585 PyObject *resunicode;
2586
2587 if (*errorHandler == NULL) {
2588 *errorHandler = PyCodec_LookupError(errors);
2589 if (*errorHandler == NULL)
2590 return NULL;
2591 }
2592
2593 make_encode_exception(exceptionObject,
2594 encoding, unicode, size, startpos, endpos, reason);
2595 if (*exceptionObject == NULL)
2596 return NULL;
2597
2598 restuple = PyObject_CallFunctionObjArgs(
2599 *errorHandler, *exceptionObject, NULL);
2600 if (restuple == NULL)
2601 return NULL;
2602 if (!PyTuple_Check(restuple)) {
2603 PyErr_Format(PyExc_TypeError, &argparse[4]);
2604 Py_DECREF(restuple);
2605 return NULL;
2606 }
2607 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2608 &resunicode, newpos)) {
2609 Py_DECREF(restuple);
2610 return NULL;
2611 }
2612 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002613 *newpos = size+*newpos;
2614 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002615 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002616 Py_DECREF(restuple);
2617 return NULL;
2618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 Py_INCREF(resunicode);
2620 Py_DECREF(restuple);
2621 return resunicode;
2622}
2623
2624static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002625 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 const char *errors,
2627 int limit)
2628{
2629 /* output object */
2630 PyObject *res;
2631 /* pointers to the beginning and end+1 of input */
2632 const Py_UNICODE *startp = p;
2633 const Py_UNICODE *endp = p + size;
2634 /* pointer to the beginning of the unencodable characters */
2635 /* const Py_UNICODE *badp = NULL; */
2636 /* pointer into the output */
2637 char *str;
2638 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002639 Py_ssize_t respos = 0;
2640 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002641 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2642 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 PyObject *errorHandler = NULL;
2644 PyObject *exc = NULL;
2645 /* the following variable is used for caching string comparisons
2646 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2647 int known_errorHandler = -1;
2648
2649 /* allocate enough for a simple encoding without
2650 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002651 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 if (res == NULL)
2653 goto onError;
2654 if (size == 0)
2655 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002656 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 ressize = size;
2658
2659 while (p<endp) {
2660 Py_UNICODE c = *p;
2661
2662 /* can we encode this? */
2663 if (c<limit) {
2664 /* no overflow check, because we know that the space is enough */
2665 *str++ = (char)c;
2666 ++p;
2667 }
2668 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002669 Py_ssize_t unicodepos = p-startp;
2670 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002672 Py_ssize_t repsize;
2673 Py_ssize_t newpos;
2674 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 Py_UNICODE *uni2;
2676 /* startpos for collecting unencodable chars */
2677 const Py_UNICODE *collstart = p;
2678 const Py_UNICODE *collend = p;
2679 /* find all unecodable characters */
2680 while ((collend < endp) && ((*collend)>=limit))
2681 ++collend;
2682 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2683 if (known_errorHandler==-1) {
2684 if ((errors==NULL) || (!strcmp(errors, "strict")))
2685 known_errorHandler = 1;
2686 else if (!strcmp(errors, "replace"))
2687 known_errorHandler = 2;
2688 else if (!strcmp(errors, "ignore"))
2689 known_errorHandler = 3;
2690 else if (!strcmp(errors, "xmlcharrefreplace"))
2691 known_errorHandler = 4;
2692 else
2693 known_errorHandler = 0;
2694 }
2695 switch (known_errorHandler) {
2696 case 1: /* strict */
2697 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2698 goto onError;
2699 case 2: /* replace */
2700 while (collstart++<collend)
2701 *str++ = '?'; /* fall through */
2702 case 3: /* ignore */
2703 p = collend;
2704 break;
2705 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002706 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 /* determine replacement size (temporarily (mis)uses p) */
2708 for (p = collstart, repsize = 0; p < collend; ++p) {
2709 if (*p<10)
2710 repsize += 2+1+1;
2711 else if (*p<100)
2712 repsize += 2+2+1;
2713 else if (*p<1000)
2714 repsize += 2+3+1;
2715 else if (*p<10000)
2716 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002717#ifndef Py_UNICODE_WIDE
2718 else
2719 repsize += 2+5+1;
2720#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 else if (*p<100000)
2722 repsize += 2+5+1;
2723 else if (*p<1000000)
2724 repsize += 2+6+1;
2725 else
2726 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002727#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 }
2729 requiredsize = respos+repsize+(endp-collend);
2730 if (requiredsize > ressize) {
2731 if (requiredsize<2*ressize)
2732 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002733 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002735 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 ressize = requiredsize;
2737 }
2738 /* generate replacement (temporarily (mis)uses p) */
2739 for (p = collstart; p < collend; ++p) {
2740 str += sprintf(str, "&#%d;", (int)*p);
2741 }
2742 p = collend;
2743 break;
2744 default:
2745 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2746 encoding, reason, startp, size, &exc,
2747 collstart-startp, collend-startp, &newpos);
2748 if (repunicode == NULL)
2749 goto onError;
2750 /* need more space? (at least enough for what we
2751 have+the replacement+the rest of the string, so
2752 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002753 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 repsize = PyUnicode_GET_SIZE(repunicode);
2755 requiredsize = respos+repsize+(endp-collend);
2756 if (requiredsize > ressize) {
2757 if (requiredsize<2*ressize)
2758 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002759 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_DECREF(repunicode);
2761 goto onError;
2762 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002763 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 ressize = requiredsize;
2765 }
2766 /* check if there is anything unencodable in the replacement
2767 and copy it to the output */
2768 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2769 c = *uni2;
2770 if (c >= limit) {
2771 raise_encode_exception(&exc, encoding, startp, size,
2772 unicodepos, unicodepos+1, reason);
2773 Py_DECREF(repunicode);
2774 goto onError;
2775 }
2776 *str = (char)c;
2777 }
2778 p = startp + newpos;
2779 Py_DECREF(repunicode);
2780 }
2781 }
2782 }
2783 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002784 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 if (respos<ressize)
2786 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002787 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
2790 return res;
2791
2792 onError:
2793 Py_XDECREF(res);
2794 Py_XDECREF(errorHandler);
2795 Py_XDECREF(exc);
2796 return NULL;
2797}
2798
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002800 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 const char *errors)
2802{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804}
2805
2806PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2807{
2808 if (!PyUnicode_Check(unicode)) {
2809 PyErr_BadArgument();
2810 return NULL;
2811 }
2812 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2813 PyUnicode_GET_SIZE(unicode),
2814 NULL);
2815}
2816
2817/* --- 7-bit ASCII Codec -------------------------------------------------- */
2818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002820 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 const char *errors)
2822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 PyUnicodeObject *v;
2825 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 Py_ssize_t startinpos;
2827 Py_ssize_t endinpos;
2828 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 const char *e;
2830 PyObject *errorHandler = NULL;
2831 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002832
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002834 if (size == 1 && *(unsigned char*)s < 128) {
2835 Py_UNICODE r = *(unsigned char*)s;
2836 return PyUnicode_FromUnicode(&r, 1);
2837 }
Tim Petersced69f82003-09-16 20:30:58 +00002838
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 v = _PyUnicode_New(size);
2840 if (v == NULL)
2841 goto onError;
2842 if (size == 0)
2843 return (PyObject *)v;
2844 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 e = s + size;
2846 while (s < e) {
2847 register unsigned char c = (unsigned char)*s;
2848 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 ++s;
2851 }
2852 else {
2853 startinpos = s-starts;
2854 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002855 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 if (unicode_decode_call_errorhandler(
2857 errors, &errorHandler,
2858 "ascii", "ordinal not in range(128)",
2859 starts, size, &startinpos, &endinpos, &exc, &s,
2860 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002864 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002865 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002866 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 Py_XDECREF(errorHandler);
2868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002870
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 onError:
2872 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 Py_XDECREF(errorHandler);
2874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 return NULL;
2876}
2877
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002879 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 const char *errors)
2881{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
2885PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2886{
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 return NULL;
2890 }
2891 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2892 PyUnicode_GET_SIZE(unicode),
2893 NULL);
2894}
2895
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002896#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002897
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002898/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002899
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002900#if SIZEOF_INT < SIZEOF_SSIZE_T
2901#define NEED_RETRY
2902#endif
2903
2904/* XXX This code is limited to "true" double-byte encodings, as
2905 a) it assumes an incomplete character consists of a single byte, and
2906 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2907 encodings, see IsDBCSLeadByteEx documentation. */
2908
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte(). It is then accepted when
   CharPrev() cannot step back from it, when the byte CharPrev() lands on
   is not itself a lead byte, or when CharPrev() stepped back exactly two
   bytes. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
2919
2920/*
2921 * Decode MBCS string into unicode object. If 'final' is set, converts
2922 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2923 */
2924static int decode_mbcs(PyUnicodeObject **v,
2925 const char *s, /* MBCS string */
2926 int size, /* sizeof MBCS string */
2927 int final)
2928{
2929 Py_UNICODE *p;
2930 Py_ssize_t n = 0;
2931 int usize = 0;
2932
2933 assert(size >= 0);
2934
2935 /* Skip trailing lead-byte unless 'final' is set */
2936 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2937 --size;
2938
2939 /* First get the size of the result */
2940 if (size > 0) {
2941 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2942 if (usize == 0) {
2943 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2944 return -1;
2945 }
2946 }
2947
2948 if (*v == NULL) {
2949 /* Create unicode object */
2950 *v = _PyUnicode_New(usize);
2951 if (*v == NULL)
2952 return -1;
2953 }
2954 else {
2955 /* Extend unicode object */
2956 n = PyUnicode_GET_SIZE(*v);
2957 if (_PyUnicode_Resize(v, n + usize) < 0)
2958 return -1;
2959 }
2960
2961 /* Do the conversion */
2962 if (size > 0) {
2963 p = PyUnicode_AS_UNICODE(*v) + n;
2964 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2965 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2966 return -1;
2967 }
2968 }
2969
2970 return size;
2971}
2972
2973PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2974 Py_ssize_t size,
2975 const char *errors,
2976 Py_ssize_t *consumed)
2977{
2978 PyUnicodeObject *v = NULL;
2979 int done;
2980
2981 if (consumed)
2982 *consumed = 0;
2983
2984#ifdef NEED_RETRY
2985 retry:
2986 if (size > INT_MAX)
2987 done = decode_mbcs(&v, s, INT_MAX, 0);
2988 else
2989#endif
2990 done = decode_mbcs(&v, s, (int)size, !consumed);
2991
2992 if (done < 0) {
2993 Py_XDECREF(v);
2994 return NULL;
2995 }
2996
2997 if (consumed)
2998 *consumed += done;
2999
3000#ifdef NEED_RETRY
3001 if (size > INT_MAX) {
3002 s += done;
3003 size -= done;
3004 goto retry;
3005 }
3006#endif
3007
3008 return (PyObject *)v;
3009}
3010
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003011PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003012 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003013 const char *errors)
3014{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003015 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3016}
3017
3018/*
3019 * Convert unicode into string object (MBCS).
3020 * Returns 0 if succeed, -1 otherwise.
3021 */
3022static int encode_mbcs(PyObject **repr,
3023 const Py_UNICODE *p, /* unicode */
3024 int size) /* size of unicode */
3025{
3026 int mbcssize = 0;
3027 Py_ssize_t n = 0;
3028
3029 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003030
3031 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003032 if (size > 0) {
3033 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3034 if (mbcssize == 0) {
3035 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3036 return -1;
3037 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003038 }
3039
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003040 if (*repr == NULL) {
3041 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003042 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003043 if (*repr == NULL)
3044 return -1;
3045 }
3046 else {
3047 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003048 n = PyBytes_Size(*repr);
3049 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003050 return -1;
3051 }
3052
3053 /* Do the conversion */
3054 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003055 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003056 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3057 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3058 return -1;
3059 }
3060 }
3061
3062 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003063}
3064
3065PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003067 const char *errors)
3068{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003069 PyObject *repr = NULL;
3070 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003071
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003072#ifdef NEED_RETRY
3073 retry:
3074 if (size > INT_MAX)
3075 ret = encode_mbcs(&repr, p, INT_MAX);
3076 else
3077#endif
3078 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003079
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003080 if (ret < 0) {
3081 Py_XDECREF(repr);
3082 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003083 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003084
3085#ifdef NEED_RETRY
3086 if (size > INT_MAX) {
3087 p += INT_MAX;
3088 size -= INT_MAX;
3089 goto retry;
3090 }
3091#endif
3092
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003093 return repr;
3094}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003095
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003096PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3097{
3098 if (!PyUnicode_Check(unicode)) {
3099 PyErr_BadArgument();
3100 return NULL;
3101 }
3102 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3103 PyUnicode_GET_SIZE(unicode),
3104 NULL);
3105}
3106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003107#undef NEED_RETRY
3108
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003109#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003110
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111/* --- Character Mapping Codec -------------------------------------------- */
3112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003114 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 PyObject *mapping,
3116 const char *errors)
3117{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t startinpos;
3120 Py_ssize_t endinpos;
3121 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 PyUnicodeObject *v;
3124 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003125 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 PyObject *errorHandler = NULL;
3127 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003128 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003129 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003130
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 /* Default to Latin-1 */
3132 if (mapping == NULL)
3133 return PyUnicode_DecodeLatin1(s, size, errors);
3134
3135 v = _PyUnicode_New(size);
3136 if (v == NULL)
3137 goto onError;
3138 if (size == 0)
3139 return (PyObject *)v;
3140 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003142 if (PyUnicode_CheckExact(mapping)) {
3143 mapstring = PyUnicode_AS_UNICODE(mapping);
3144 maplen = PyUnicode_GET_SIZE(mapping);
3145 while (s < e) {
3146 unsigned char ch = *s;
3147 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003149 if (ch < maplen)
3150 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003152 if (x == 0xfffe) {
3153 /* undefined mapping */
3154 outpos = p-PyUnicode_AS_UNICODE(v);
3155 startinpos = s-starts;
3156 endinpos = startinpos+1;
3157 if (unicode_decode_call_errorhandler(
3158 errors, &errorHandler,
3159 "charmap", "character maps to <undefined>",
3160 starts, size, &startinpos, &endinpos, &exc, &s,
3161 (PyObject **)&v, &outpos, &p)) {
3162 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003163 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003164 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003165 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003166 *p++ = x;
3167 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003169 }
3170 else {
3171 while (s < e) {
3172 unsigned char ch = *s;
3173 PyObject *w, *x;
3174
3175 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3176 w = PyInt_FromLong((long)ch);
3177 if (w == NULL)
3178 goto onError;
3179 x = PyObject_GetItem(mapping, w);
3180 Py_DECREF(w);
3181 if (x == NULL) {
3182 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3183 /* No mapping found means: mapping is undefined. */
3184 PyErr_Clear();
3185 x = Py_None;
3186 Py_INCREF(x);
3187 } else
3188 goto onError;
3189 }
3190
3191 /* Apply mapping */
3192 if (PyInt_Check(x)) {
3193 long value = PyInt_AS_LONG(x);
3194 if (value < 0 || value > 65535) {
3195 PyErr_SetString(PyExc_TypeError,
3196 "character mapping must be in range(65536)");
3197 Py_DECREF(x);
3198 goto onError;
3199 }
3200 *p++ = (Py_UNICODE)value;
3201 }
3202 else if (x == Py_None) {
3203 /* undefined mapping */
3204 outpos = p-PyUnicode_AS_UNICODE(v);
3205 startinpos = s-starts;
3206 endinpos = startinpos+1;
3207 if (unicode_decode_call_errorhandler(
3208 errors, &errorHandler,
3209 "charmap", "character maps to <undefined>",
3210 starts, size, &startinpos, &endinpos, &exc, &s,
3211 (PyObject **)&v, &outpos, &p)) {
3212 Py_DECREF(x);
3213 goto onError;
3214 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003215 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003216 continue;
3217 }
3218 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003219 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003220
3221 if (targetsize == 1)
3222 /* 1-1 mapping */
3223 *p++ = *PyUnicode_AS_UNICODE(x);
3224
3225 else if (targetsize > 1) {
3226 /* 1-n mapping */
3227 if (targetsize > extrachars) {
3228 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003229 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3230 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003231 (targetsize << 2);
3232 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003233 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003234 if (_PyUnicode_Resize(&v,
3235 PyUnicode_GET_SIZE(v) + needed) < 0) {
3236 Py_DECREF(x);
3237 goto onError;
3238 }
3239 p = PyUnicode_AS_UNICODE(v) + oldpos;
3240 }
3241 Py_UNICODE_COPY(p,
3242 PyUnicode_AS_UNICODE(x),
3243 targetsize);
3244 p += targetsize;
3245 extrachars -= targetsize;
3246 }
3247 /* 1-0 mapping: skip the character */
3248 }
3249 else {
3250 /* wrong return value */
3251 PyErr_SetString(PyExc_TypeError,
3252 "character mapping must return integer, None or unicode");
3253 Py_DECREF(x);
3254 goto onError;
3255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003257 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 }
3260 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 Py_XDECREF(errorHandler);
3269 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 Py_XDECREF(v);
3271 return NULL;
3272}
3273
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003274/* Charmap encoding: the lookup table */
3275
3276struct encoding_map{
3277 PyObject_HEAD
3278 unsigned char level1[32];
3279 int count2, count3;
3280 unsigned char level23[1];
3281};
3282
3283static PyObject*
3284encoding_map_size(PyObject *obj, PyObject* args)
3285{
3286 struct encoding_map *map = (struct encoding_map*)obj;
3287 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3288 128*map->count3);
3289}
3290
3291static PyMethodDef encoding_map_methods[] = {
3292 {"size", encoding_map_size, METH_NOARGS,
3293 PyDoc_STR("Return the size (in bytes) of this object") },
3294 { 0 }
3295};
3296
3297static void
3298encoding_map_dealloc(PyObject* o)
3299{
3300 PyObject_FREE(o);
3301}
3302
3303static PyTypeObject EncodingMapType = {
3304 PyObject_HEAD_INIT(NULL)
3305 0, /*ob_size*/
3306 "EncodingMap", /*tp_name*/
3307 sizeof(struct encoding_map), /*tp_basicsize*/
3308 0, /*tp_itemsize*/
3309 /* methods */
3310 encoding_map_dealloc, /*tp_dealloc*/
3311 0, /*tp_print*/
3312 0, /*tp_getattr*/
3313 0, /*tp_setattr*/
3314 0, /*tp_compare*/
3315 0, /*tp_repr*/
3316 0, /*tp_as_number*/
3317 0, /*tp_as_sequence*/
3318 0, /*tp_as_mapping*/
3319 0, /*tp_hash*/
3320 0, /*tp_call*/
3321 0, /*tp_str*/
3322 0, /*tp_getattro*/
3323 0, /*tp_setattro*/
3324 0, /*tp_as_buffer*/
3325 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3326 0, /*tp_doc*/
3327 0, /*tp_traverse*/
3328 0, /*tp_clear*/
3329 0, /*tp_richcompare*/
3330 0, /*tp_weaklistoffset*/
3331 0, /*tp_iter*/
3332 0, /*tp_iternext*/
3333 encoding_map_methods, /*tp_methods*/
3334 0, /*tp_members*/
3335 0, /*tp_getset*/
3336 0, /*tp_base*/
3337 0, /*tp_dict*/
3338 0, /*tp_descr_get*/
3339 0, /*tp_descr_set*/
3340 0, /*tp_dictoffset*/
3341 0, /*tp_init*/
3342 0, /*tp_alloc*/
3343 0, /*tp_new*/
3344 0, /*tp_free*/
3345 0, /*tp_is_gc*/
3346};
3347
3348PyObject*
3349PyUnicode_BuildEncodingMap(PyObject* string)
3350{
3351 Py_UNICODE *decode;
3352 PyObject *result;
3353 struct encoding_map *mresult;
3354 int i;
3355 int need_dict = 0;
3356 unsigned char level1[32];
3357 unsigned char level2[512];
3358 unsigned char *mlevel1, *mlevel2, *mlevel3;
3359 int count2 = 0, count3 = 0;
3360
3361 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3362 PyErr_BadArgument();
3363 return NULL;
3364 }
3365 decode = PyUnicode_AS_UNICODE(string);
3366 memset(level1, 0xFF, sizeof level1);
3367 memset(level2, 0xFF, sizeof level2);
3368
3369 /* If there isn't a one-to-one mapping of NULL to \0,
3370 or if there are non-BMP characters, we need to use
3371 a mapping dictionary. */
3372 if (decode[0] != 0)
3373 need_dict = 1;
3374 for (i = 1; i < 256; i++) {
3375 int l1, l2;
3376 if (decode[i] == 0
3377 #ifdef Py_UNICODE_WIDE
3378 || decode[i] > 0xFFFF
3379 #endif
3380 ) {
3381 need_dict = 1;
3382 break;
3383 }
3384 if (decode[i] == 0xFFFE)
3385 /* unmapped character */
3386 continue;
3387 l1 = decode[i] >> 11;
3388 l2 = decode[i] >> 7;
3389 if (level1[l1] == 0xFF)
3390 level1[l1] = count2++;
3391 if (level2[l2] == 0xFF)
3392 level2[l2] = count3++;
3393 }
3394
3395 if (count2 >= 0xFF || count3 >= 0xFF)
3396 need_dict = 1;
3397
3398 if (need_dict) {
3399 PyObject *result = PyDict_New();
3400 PyObject *key, *value;
3401 if (!result)
3402 return NULL;
3403 for (i = 0; i < 256; i++) {
3404 key = value = NULL;
3405 key = PyInt_FromLong(decode[i]);
3406 value = PyInt_FromLong(i);
3407 if (!key || !value)
3408 goto failed1;
3409 if (PyDict_SetItem(result, key, value) == -1)
3410 goto failed1;
3411 Py_DECREF(key);
3412 Py_DECREF(value);
3413 }
3414 return result;
3415 failed1:
3416 Py_XDECREF(key);
3417 Py_XDECREF(value);
3418 Py_DECREF(result);
3419 return NULL;
3420 }
3421
3422 /* Create a three-level trie */
3423 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3424 16*count2 + 128*count3 - 1);
3425 if (!result)
3426 return PyErr_NoMemory();
3427 PyObject_Init(result, &EncodingMapType);
3428 mresult = (struct encoding_map*)result;
3429 mresult->count2 = count2;
3430 mresult->count3 = count3;
3431 mlevel1 = mresult->level1;
3432 mlevel2 = mresult->level23;
3433 mlevel3 = mresult->level23 + 16*count2;
3434 memcpy(mlevel1, level1, 32);
3435 memset(mlevel2, 0xFF, 16*count2);
3436 memset(mlevel3, 0, 128*count3);
3437 count3 = 0;
3438 for (i = 1; i < 256; i++) {
3439 int o1, o2, o3, i2, i3;
3440 if (decode[i] == 0xFFFE)
3441 /* unmapped character */
3442 continue;
3443 o1 = decode[i]>>11;
3444 o2 = (decode[i]>>7) & 0xF;
3445 i2 = 16*mlevel1[o1] + o2;
3446 if (mlevel2[i2] == 0xFF)
3447 mlevel2[i2] = count3++;
3448 o3 = decode[i] & 0x7F;
3449 i3 = 128*mlevel2[i2] + o3;
3450 mlevel3[i3] = i;
3451 }
3452 return result;
3453}
3454
3455static int
3456encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3457{
3458 struct encoding_map *map = (struct encoding_map*)mapping;
3459 int l1 = c>>11;
3460 int l2 = (c>>7) & 0xF;
3461 int l3 = c & 0x7F;
3462 int i;
3463
3464#ifdef Py_UNICODE_WIDE
3465 if (c > 0xFFFF) {
3466 return -1;
3467 }
3468#endif
3469 if (c == 0)
3470 return 0;
3471 /* level 1*/
3472 i = map->level1[l1];
3473 if (i == 0xFF) {
3474 return -1;
3475 }
3476 /* level 2*/
3477 i = map->level23[16*i+l2];
3478 if (i == 0xFF) {
3479 return -1;
3480 }
3481 /* level 3 */
3482 i = map->level23[16*map->count2 + 128*i + l3];
3483 if (i == 0) {
3484 return -1;
3485 }
3486 return i;
3487}
3488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489/* Lookup the character ch in the mapping. If the character
3490 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003491 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 PyObject *w = PyInt_FromLong((long)c);
3495 PyObject *x;
3496
3497 if (w == NULL)
3498 return NULL;
3499 x = PyObject_GetItem(mapping, w);
3500 Py_DECREF(w);
3501 if (x == NULL) {
3502 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3503 /* No mapping found means: mapping is undefined. */
3504 PyErr_Clear();
3505 x = Py_None;
3506 Py_INCREF(x);
3507 return x;
3508 } else
3509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003511 else if (x == Py_None)
3512 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 else if (PyInt_Check(x)) {
3514 long value = PyInt_AS_LONG(x);
3515 if (value < 0 || value > 255) {
3516 PyErr_SetString(PyExc_TypeError,
3517 "character mapping must be in range(256)");
3518 Py_DECREF(x);
3519 return NULL;
3520 }
3521 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 else if (PyString_Check(x))
3524 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 /* wrong return value */
3527 PyErr_SetString(PyExc_TypeError,
3528 "character mapping must return integer, None or str");
3529 Py_DECREF(x);
3530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 }
3532}
3533
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003534static int
3535charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3536{
3537 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3538 /* exponentially overallocate to minimize reallocations */
3539 if (requiredsize < 2*outsize)
3540 requiredsize = 2*outsize;
3541 if (_PyString_Resize(outobj, requiredsize)) {
3542 return 0;
3543 }
3544 return 1;
3545}
3546
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550/* lookup the character, put the result in the output string and adjust
3551 various state variables. Reallocate the output string if not enough
3552 space is available. Return a new reference to the object that
3553 was put in the output buffer, or Py_None, if the mapping was undefined
3554 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003555 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003557charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003560 PyObject *rep;
3561 char *outstart;
3562 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003564 if (mapping->ob_type == &EncodingMapType) {
3565 int res = encoding_map_lookup(c, mapping);
3566 Py_ssize_t requiredsize = *outpos+1;
3567 if (res == -1)
3568 return enc_FAILED;
3569 if (outsize<requiredsize)
3570 if (!charmapencode_resize(outobj, outpos, requiredsize))
3571 return enc_EXCEPTION;
3572 outstart = PyString_AS_STRING(*outobj);
3573 outstart[(*outpos)++] = (char)res;
3574 return enc_SUCCESS;
3575 }
3576
3577 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003579 return enc_EXCEPTION;
3580 else if (rep==Py_None) {
3581 Py_DECREF(rep);
3582 return enc_FAILED;
3583 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003586 if (outsize<requiredsize)
3587 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003589 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003591 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3593 }
3594 else {
3595 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003596 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3597 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003598 if (outsize<requiredsize)
3599 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003601 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003603 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 memcpy(outstart + *outpos, repchars, repsize);
3605 *outpos += repsize;
3606 }
3607 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003608 Py_DECREF(rep);
3609 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610}
3611
3612/* handle an error in PyUnicode_EncodeCharmap
3613 Return 0 on success, -1 on error */
3614static
3615int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003618 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003619 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620{
3621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003622 Py_ssize_t repsize;
3623 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 Py_UNICODE *uni2;
3625 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 Py_ssize_t collstartpos = *inpos;
3627 Py_ssize_t collendpos = *inpos+1;
3628 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 char *encoding = "charmap";
3630 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003631 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 /* find all unencodable characters */
3634 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003635 PyObject *rep;
3636 if (mapping->ob_type == &EncodingMapType) {
3637 int res = encoding_map_lookup(p[collendpos], mapping);
3638 if (res != -1)
3639 break;
3640 ++collendpos;
3641 continue;
3642 }
3643
3644 rep = charmapencode_lookup(p[collendpos], mapping);
3645 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003647 else if (rep!=Py_None) {
3648 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 break;
3650 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003651 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 ++collendpos;
3653 }
3654 /* cache callback name lookup
3655 * (if not done yet, i.e. it's the first error) */
3656 if (*known_errorHandler==-1) {
3657 if ((errors==NULL) || (!strcmp(errors, "strict")))
3658 *known_errorHandler = 1;
3659 else if (!strcmp(errors, "replace"))
3660 *known_errorHandler = 2;
3661 else if (!strcmp(errors, "ignore"))
3662 *known_errorHandler = 3;
3663 else if (!strcmp(errors, "xmlcharrefreplace"))
3664 *known_errorHandler = 4;
3665 else
3666 *known_errorHandler = 0;
3667 }
3668 switch (*known_errorHandler) {
3669 case 1: /* strict */
3670 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3671 return -1;
3672 case 2: /* replace */
3673 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3674 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003675 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 return -1;
3677 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003678 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3680 return -1;
3681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 }
3683 /* fall through */
3684 case 3: /* ignore */
3685 *inpos = collendpos;
3686 break;
3687 case 4: /* xmlcharrefreplace */
3688 /* generate replacement (temporarily (mis)uses p) */
3689 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3690 char buffer[2+29+1+1];
3691 char *cp;
3692 sprintf(buffer, "&#%d;", (int)p[collpos]);
3693 for (cp = buffer; *cp; ++cp) {
3694 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003697 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3699 return -1;
3700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 }
3702 }
3703 *inpos = collendpos;
3704 break;
3705 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003706 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 encoding, reason, p, size, exceptionObject,
3708 collstartpos, collendpos, &newpos);
3709 if (repunicode == NULL)
3710 return -1;
3711 /* generate replacement */
3712 repsize = PyUnicode_GET_SIZE(repunicode);
3713 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3714 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 return -1;
3717 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003718 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3721 return -1;
3722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 }
3724 *inpos = newpos;
3725 Py_DECREF(repunicode);
3726 }
3727 return 0;
3728}
3729
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003731 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 PyObject *mapping,
3733 const char *errors)
3734{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 /* output object */
3736 PyObject *res = NULL;
3737 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003740 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 PyObject *errorHandler = NULL;
3742 PyObject *exc = NULL;
3743 /* the following variable is used for caching string comparisons
3744 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3745 * 3=ignore, 4=xmlcharrefreplace */
3746 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747
3748 /* Default to Latin-1 */
3749 if (mapping == NULL)
3750 return PyUnicode_EncodeLatin1(p, size, errors);
3751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 /* allocate enough for a simple encoding without
3753 replacements, if we need more, we'll resize */
3754 res = PyString_FromStringAndSize(NULL, size);
3755 if (res == NULL)
3756 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003757 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 while (inpos<size) {
3761 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003762 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3763 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003765 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 if (charmap_encoding_error(p, size, &inpos, mapping,
3767 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003768 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003769 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003770 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 else
3774 /* done with this character => adjust input position */
3775 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 /* Resize if we allocated to much */
3779 if (respos<PyString_GET_SIZE(res)) {
3780 if (_PyString_Resize(&res, respos))
3781 goto onError;
3782 }
3783 Py_XDECREF(exc);
3784 Py_XDECREF(errorHandler);
3785 return res;
3786
3787 onError:
3788 Py_XDECREF(res);
3789 Py_XDECREF(exc);
3790 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 return NULL;
3792}
3793
3794PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3795 PyObject *mapping)
3796{
3797 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3798 PyErr_BadArgument();
3799 return NULL;
3800 }
3801 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3802 PyUnicode_GET_SIZE(unicode),
3803 mapping,
3804 NULL);
3805}
3806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807/* create or adjust a UnicodeTranslateError */
3808static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003809 const Py_UNICODE *unicode, Py_ssize_t size,
3810 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 if (*exceptionObject == NULL) {
3814 *exceptionObject = PyUnicodeTranslateError_Create(
3815 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
3817 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3819 goto onError;
3820 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3821 goto onError;
3822 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3823 goto onError;
3824 return;
3825 onError:
3826 Py_DECREF(*exceptionObject);
3827 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
3829}
3830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831/* raises a UnicodeTranslateError */
3832static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 const Py_UNICODE *unicode, Py_ssize_t size,
3834 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 const char *reason)
3836{
3837 make_translate_exception(exceptionObject,
3838 unicode, size, startpos, endpos, reason);
3839 if (*exceptionObject != NULL)
3840 PyCodec_StrictErrors(*exceptionObject);
3841}
3842
3843/* error handling callback helper:
3844 build arguments, call the callback and check the arguments,
3845 put the result into newpos and return the replacement string, which
3846 has to be freed by the caller */
3847static PyObject *unicode_translate_call_errorhandler(const char *errors,
3848 PyObject **errorHandler,
3849 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003850 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3851 Py_ssize_t startpos, Py_ssize_t endpos,
3852 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003854 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003856 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 PyObject *restuple;
3858 PyObject *resunicode;
3859
3860 if (*errorHandler == NULL) {
3861 *errorHandler = PyCodec_LookupError(errors);
3862 if (*errorHandler == NULL)
3863 return NULL;
3864 }
3865
3866 make_translate_exception(exceptionObject,
3867 unicode, size, startpos, endpos, reason);
3868 if (*exceptionObject == NULL)
3869 return NULL;
3870
3871 restuple = PyObject_CallFunctionObjArgs(
3872 *errorHandler, *exceptionObject, NULL);
3873 if (restuple == NULL)
3874 return NULL;
3875 if (!PyTuple_Check(restuple)) {
3876 PyErr_Format(PyExc_TypeError, &argparse[4]);
3877 Py_DECREF(restuple);
3878 return NULL;
3879 }
3880 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003881 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 Py_DECREF(restuple);
3883 return NULL;
3884 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003885 if (i_newpos<0)
3886 *newpos = size+i_newpos;
3887 else
3888 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003889 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003890 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003891 Py_DECREF(restuple);
3892 return NULL;
3893 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 Py_INCREF(resunicode);
3895 Py_DECREF(restuple);
3896 return resunicode;
3897}
3898
3899/* Lookup the character ch in the mapping and put the result in result,
3900 which must be decrefed by the caller.
3901 Return 0 on success, -1 on error */
3902static
3903int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3904{
3905 PyObject *w = PyInt_FromLong((long)c);
3906 PyObject *x;
3907
3908 if (w == NULL)
3909 return -1;
3910 x = PyObject_GetItem(mapping, w);
3911 Py_DECREF(w);
3912 if (x == NULL) {
3913 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3914 /* No mapping found means: use 1:1 mapping. */
3915 PyErr_Clear();
3916 *result = NULL;
3917 return 0;
3918 } else
3919 return -1;
3920 }
3921 else if (x == Py_None) {
3922 *result = x;
3923 return 0;
3924 }
3925 else if (PyInt_Check(x)) {
3926 long value = PyInt_AS_LONG(x);
3927 long max = PyUnicode_GetMax();
3928 if (value < 0 || value > max) {
3929 PyErr_Format(PyExc_TypeError,
3930 "character mapping must be in range(0x%lx)", max+1);
3931 Py_DECREF(x);
3932 return -1;
3933 }
3934 *result = x;
3935 return 0;
3936 }
3937 else if (PyUnicode_Check(x)) {
3938 *result = x;
3939 return 0;
3940 }
3941 else {
3942 /* wrong return value */
3943 PyErr_SetString(PyExc_TypeError,
3944 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003945 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 return -1;
3947 }
3948}
3949/* ensure that *outobj is at least requiredsize characters long,
3950if not reallocate and adjust various state variables.
3951Return 0 on success, -1 on error */
3952static
Walter Dörwald4894c302003-10-24 14:25:28 +00003953int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003957 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003961 if (requiredsize < 2 * oldsize)
3962 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003963 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 return -1;
3965 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 }
3967 return 0;
3968}
3969/* lookup the character, put the result in the output string and adjust
3970 various state variables. Return a new reference to the object that
3971 was put in the output buffer in *result, or Py_None, if the mapping was
3972 undefined (in which case no character was written).
3973 The called must decref result.
3974 Return 0 on success, -1 on error. */
3975static
Walter Dörwald4894c302003-10-24 14:25:28 +00003976int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003977 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003978 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979{
Walter Dörwald4894c302003-10-24 14:25:28 +00003980 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 return -1;
3982 if (*res==NULL) {
3983 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 }
3986 else if (*res==Py_None)
3987 ;
3988 else if (PyInt_Check(*res)) {
3989 /* no overflow check, because we know that the space is enough */
3990 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3991 }
3992 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003993 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 if (repsize==1) {
3995 /* no overflow check, because we know that the space is enough */
3996 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3997 }
3998 else if (repsize!=0) {
3999 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004000 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004001 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004002 repsize - 1;
4003 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 return -1;
4005 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4006 *outp += repsize;
4007 }
4008 }
4009 else
4010 return -1;
4011 return 0;
4012}
4013
4014PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 PyObject *mapping,
4017 const char *errors)
4018{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 /* output object */
4020 PyObject *res = NULL;
4021 /* pointers to the beginning and end+1 of input */
4022 const Py_UNICODE *startp = p;
4023 const Py_UNICODE *endp = p + size;
4024 /* pointer into the output */
4025 Py_UNICODE *str;
4026 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004027 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 char *reason = "character maps to <undefined>";
4029 PyObject *errorHandler = NULL;
4030 PyObject *exc = NULL;
4031 /* the following variable is used for caching string comparisons
4032 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4033 * 3=ignore, 4=xmlcharrefreplace */
4034 int known_errorHandler = -1;
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 if (mapping == NULL) {
4037 PyErr_BadArgument();
4038 return NULL;
4039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040
4041 /* allocate enough for a simple 1:1 translation without
4042 replacements, if we need more, we'll resize */
4043 res = PyUnicode_FromUnicode(NULL, size);
4044 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004045 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 return res;
4048 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 while (p<endp) {
4051 /* try to encode it */
4052 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004053 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 goto onError;
4056 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004057 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 if (x!=Py_None) /* it worked => adjust input pointer */
4059 ++p;
4060 else { /* untranslatable character */
4061 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t repsize;
4063 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 Py_UNICODE *uni2;
4065 /* startpos for collecting untranslatable chars */
4066 const Py_UNICODE *collstart = p;
4067 const Py_UNICODE *collend = p+1;
4068 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 /* find all untranslatable characters */
4071 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004072 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 goto onError;
4074 Py_XDECREF(x);
4075 if (x!=Py_None)
4076 break;
4077 ++collend;
4078 }
4079 /* cache callback name lookup
4080 * (if not done yet, i.e. it's the first error) */
4081 if (known_errorHandler==-1) {
4082 if ((errors==NULL) || (!strcmp(errors, "strict")))
4083 known_errorHandler = 1;
4084 else if (!strcmp(errors, "replace"))
4085 known_errorHandler = 2;
4086 else if (!strcmp(errors, "ignore"))
4087 known_errorHandler = 3;
4088 else if (!strcmp(errors, "xmlcharrefreplace"))
4089 known_errorHandler = 4;
4090 else
4091 known_errorHandler = 0;
4092 }
4093 switch (known_errorHandler) {
4094 case 1: /* strict */
4095 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4096 goto onError;
4097 case 2: /* replace */
4098 /* No need to check for space, this is a 1:1 replacement */
4099 for (coll = collstart; coll<collend; ++coll)
4100 *str++ = '?';
4101 /* fall through */
4102 case 3: /* ignore */
4103 p = collend;
4104 break;
4105 case 4: /* xmlcharrefreplace */
4106 /* generate replacement (temporarily (mis)uses p) */
4107 for (p = collstart; p < collend; ++p) {
4108 char buffer[2+29+1+1];
4109 char *cp;
4110 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004111 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4113 goto onError;
4114 for (cp = buffer; *cp; ++cp)
4115 *str++ = *cp;
4116 }
4117 p = collend;
4118 break;
4119 default:
4120 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4121 reason, startp, size, &exc,
4122 collstart-startp, collend-startp, &newpos);
4123 if (repunicode == NULL)
4124 goto onError;
4125 /* generate replacement */
4126 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004127 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4129 Py_DECREF(repunicode);
4130 goto onError;
4131 }
4132 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4133 *str++ = *uni2;
4134 p = startp + newpos;
4135 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 }
4137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 /* Resize if we allocated to much */
4140 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004141 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004142 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 }
4145 Py_XDECREF(exc);
4146 Py_XDECREF(errorHandler);
4147 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 onError:
4150 Py_XDECREF(res);
4151 Py_XDECREF(exc);
4152 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 return NULL;
4154}
4155
4156PyObject *PyUnicode_Translate(PyObject *str,
4157 PyObject *mapping,
4158 const char *errors)
4159{
4160 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 str = PyUnicode_FromObject(str);
4163 if (str == NULL)
4164 goto onError;
4165 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4166 PyUnicode_GET_SIZE(str),
4167 mapping,
4168 errors);
4169 Py_DECREF(str);
4170 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 onError:
4173 Py_XDECREF(str);
4174 return NULL;
4175}
Tim Petersced69f82003-09-16 20:30:58 +00004176
Guido van Rossum9e896b32000-04-05 20:11:21 +00004177/* --- Decimal Encoder ---------------------------------------------------- */
4178
4179int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004180 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004181 char *output,
4182 const char *errors)
4183{
4184 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 PyObject *errorHandler = NULL;
4186 PyObject *exc = NULL;
4187 const char *encoding = "decimal";
4188 const char *reason = "invalid decimal Unicode string";
4189 /* the following variable is used for caching string comparisons
4190 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4191 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004192
4193 if (output == NULL) {
4194 PyErr_BadArgument();
4195 return -1;
4196 }
4197
4198 p = s;
4199 end = s + length;
4200 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004202 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004204 Py_ssize_t repsize;
4205 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 Py_UNICODE *uni2;
4207 Py_UNICODE *collstart;
4208 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004209
Guido van Rossum9e896b32000-04-05 20:11:21 +00004210 if (Py_UNICODE_ISSPACE(ch)) {
4211 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004213 continue;
4214 }
4215 decimal = Py_UNICODE_TODECIMAL(ch);
4216 if (decimal >= 0) {
4217 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004219 continue;
4220 }
Guido van Rossumba477042000-04-06 18:18:10 +00004221 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004222 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004224 continue;
4225 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 /* All other characters are considered unencodable */
4227 collstart = p;
4228 collend = p+1;
4229 while (collend < end) {
4230 if ((0 < *collend && *collend < 256) ||
4231 !Py_UNICODE_ISSPACE(*collend) ||
4232 Py_UNICODE_TODECIMAL(*collend))
4233 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 /* cache callback name lookup
4236 * (if not done yet, i.e. it's the first error) */
4237 if (known_errorHandler==-1) {
4238 if ((errors==NULL) || (!strcmp(errors, "strict")))
4239 known_errorHandler = 1;
4240 else if (!strcmp(errors, "replace"))
4241 known_errorHandler = 2;
4242 else if (!strcmp(errors, "ignore"))
4243 known_errorHandler = 3;
4244 else if (!strcmp(errors, "xmlcharrefreplace"))
4245 known_errorHandler = 4;
4246 else
4247 known_errorHandler = 0;
4248 }
4249 switch (known_errorHandler) {
4250 case 1: /* strict */
4251 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4252 goto onError;
4253 case 2: /* replace */
4254 for (p = collstart; p < collend; ++p)
4255 *output++ = '?';
4256 /* fall through */
4257 case 3: /* ignore */
4258 p = collend;
4259 break;
4260 case 4: /* xmlcharrefreplace */
4261 /* generate replacement (temporarily (mis)uses p) */
4262 for (p = collstart; p < collend; ++p)
4263 output += sprintf(output, "&#%d;", (int)*p);
4264 p = collend;
4265 break;
4266 default:
4267 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4268 encoding, reason, s, length, &exc,
4269 collstart-s, collend-s, &newpos);
4270 if (repunicode == NULL)
4271 goto onError;
4272 /* generate replacement */
4273 repsize = PyUnicode_GET_SIZE(repunicode);
4274 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4275 Py_UNICODE ch = *uni2;
4276 if (Py_UNICODE_ISSPACE(ch))
4277 *output++ = ' ';
4278 else {
4279 decimal = Py_UNICODE_TODECIMAL(ch);
4280 if (decimal >= 0)
4281 *output++ = '0' + decimal;
4282 else if (0 < ch && ch < 256)
4283 *output++ = (char)ch;
4284 else {
4285 Py_DECREF(repunicode);
4286 raise_encode_exception(&exc, encoding,
4287 s, length, collstart-s, collend-s, reason);
4288 goto onError;
4289 }
4290 }
4291 }
4292 p = s + newpos;
4293 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004294 }
4295 }
4296 /* 0-terminate the output string */
4297 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 Py_XDECREF(exc);
4299 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004300 return 0;
4301
4302 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 Py_XDECREF(exc);
4304 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004305 return -1;
4306}
4307
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308/* --- Helpers ------------------------------------------------------------ */
4309
Thomas Wouters477c8d52006-05-27 19:21:47 +00004310#define STRINGLIB_CHAR Py_UNICODE
4311
4312#define STRINGLIB_LEN PyUnicode_GET_SIZE
4313#define STRINGLIB_NEW PyUnicode_FromUnicode
4314#define STRINGLIB_STR PyUnicode_AS_UNICODE
4315
4316Py_LOCAL_INLINE(int)
4317STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004319 if (str[0] != other[0])
4320 return 1;
4321 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
Thomas Wouters477c8d52006-05-27 19:21:47 +00004324#define STRINGLIB_EMPTY unicode_empty
4325
4326#include "stringlib/fastsearch.h"
4327
4328#include "stringlib/count.h"
4329#include "stringlib/find.h"
4330#include "stringlib/partition.h"
4331
/* Helper macro that normalizes the slice bounds held in the variables
   "start" and "end" of the expanding scope against (obj)->length:
   a negative value is taken relative to the length and then clamped
   to 0; "end" is additionally capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4344
Martin v. Löwis18e16552006-02-15 17:27:45 +00004345Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004346 PyObject *substr,
4347 Py_ssize_t start,
4348 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004351 PyUnicodeObject* str_obj;
4352 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004353
Thomas Wouters477c8d52006-05-27 19:21:47 +00004354 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4355 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004357 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4358 if (!sub_obj) {
4359 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 return -1;
4361 }
Tim Petersced69f82003-09-16 20:30:58 +00004362
Thomas Wouters477c8d52006-05-27 19:21:47 +00004363 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004364
Thomas Wouters477c8d52006-05-27 19:21:47 +00004365 result = stringlib_count(
4366 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4367 );
4368
4369 Py_DECREF(sub_obj);
4370 Py_DECREF(str_obj);
4371
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 return result;
4373}
4374
Martin v. Löwis18e16552006-02-15 17:27:45 +00004375Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004376 PyObject *sub,
4377 Py_ssize_t start,
4378 Py_ssize_t end,
4379 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004384 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004385 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004386 sub = PyUnicode_FromObject(sub);
4387 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004388 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004389 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 }
Tim Petersced69f82003-09-16 20:30:58 +00004391
Thomas Wouters477c8d52006-05-27 19:21:47 +00004392 if (direction > 0)
4393 result = stringlib_find_slice(
4394 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4395 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4396 start, end
4397 );
4398 else
4399 result = stringlib_rfind_slice(
4400 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4401 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4402 start, end
4403 );
4404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004406 Py_DECREF(sub);
4407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 return result;
4409}
4410
Tim Petersced69f82003-09-16 20:30:58 +00004411static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412int tailmatch(PyUnicodeObject *self,
4413 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004414 Py_ssize_t start,
4415 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 int direction)
4417{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 if (substring->length == 0)
4419 return 1;
4420
Thomas Wouters477c8d52006-05-27 19:21:47 +00004421 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
4423 end -= substring->length;
4424 if (end < start)
4425 return 0;
4426
4427 if (direction > 0) {
4428 if (Py_UNICODE_MATCH(self, end, substring))
4429 return 1;
4430 } else {
4431 if (Py_UNICODE_MATCH(self, start, substring))
4432 return 1;
4433 }
4434
4435 return 0;
4436}
4437
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t start,
4441 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 int direction)
4443{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004445
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 str = PyUnicode_FromObject(str);
4447 if (str == NULL)
4448 return -1;
4449 substr = PyUnicode_FromObject(substr);
4450 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004451 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 return -1;
4453 }
Tim Petersced69f82003-09-16 20:30:58 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 result = tailmatch((PyUnicodeObject *)str,
4456 (PyUnicodeObject *)substr,
4457 start, end, direction);
4458 Py_DECREF(str);
4459 Py_DECREF(substr);
4460 return result;
4461}
4462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463/* Apply fixfct filter to the Unicode object self and return a
4464 reference to the modified object */
4465
Tim Petersced69f82003-09-16 20:30:58 +00004466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *fixup(PyUnicodeObject *self,
4468 int (*fixfct)(PyUnicodeObject *s))
4469{
4470
4471 PyUnicodeObject *u;
4472
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004473 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 if (u == NULL)
4475 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004476
4477 Py_UNICODE_COPY(u->str, self->str, self->length);
4478
Tim Peters7a29bd52001-09-12 03:03:31 +00004479 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 /* fixfct should return TRUE if it modified the buffer. If
4481 FALSE, return a reference to the original buffer instead
4482 (to save space, not time) */
4483 Py_INCREF(self);
4484 Py_DECREF(u);
4485 return (PyObject*) self;
4486 }
4487 return (PyObject*) u;
4488}
4489
Tim Petersced69f82003-09-16 20:30:58 +00004490static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491int fixupper(PyUnicodeObject *self)
4492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004493 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 Py_UNICODE *s = self->str;
4495 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 while (len-- > 0) {
4498 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 ch = Py_UNICODE_TOUPPER(*s);
4501 if (ch != *s) {
4502 status = 1;
4503 *s = ch;
4504 }
4505 s++;
4506 }
4507
4508 return status;
4509}
4510
Tim Petersced69f82003-09-16 20:30:58 +00004511static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512int fixlower(PyUnicodeObject *self)
4513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 Py_UNICODE *s = self->str;
4516 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004517
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 while (len-- > 0) {
4519 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004520
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 ch = Py_UNICODE_TOLOWER(*s);
4522 if (ch != *s) {
4523 status = 1;
4524 *s = ch;
4525 }
4526 s++;
4527 }
4528
4529 return status;
4530}
4531
Tim Petersced69f82003-09-16 20:30:58 +00004532static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533int fixswapcase(PyUnicodeObject *self)
4534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 Py_UNICODE *s = self->str;
4537 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 while (len-- > 0) {
4540 if (Py_UNICODE_ISUPPER(*s)) {
4541 *s = Py_UNICODE_TOLOWER(*s);
4542 status = 1;
4543 } else if (Py_UNICODE_ISLOWER(*s)) {
4544 *s = Py_UNICODE_TOUPPER(*s);
4545 status = 1;
4546 }
4547 s++;
4548 }
4549
4550 return status;
4551}
4552
Tim Petersced69f82003-09-16 20:30:58 +00004553static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554int fixcapitalize(PyUnicodeObject *self)
4555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004557 Py_UNICODE *s = self->str;
4558 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004559
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004560 if (len == 0)
4561 return 0;
4562 if (Py_UNICODE_ISLOWER(*s)) {
4563 *s = Py_UNICODE_TOUPPER(*s);
4564 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004566 s++;
4567 while (--len > 0) {
4568 if (Py_UNICODE_ISUPPER(*s)) {
4569 *s = Py_UNICODE_TOLOWER(*s);
4570 status = 1;
4571 }
4572 s++;
4573 }
4574 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575}
4576
4577static
4578int fixtitle(PyUnicodeObject *self)
4579{
4580 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4581 register Py_UNICODE *e;
4582 int previous_is_cased;
4583
4584 /* Shortcut for single character strings */
4585 if (PyUnicode_GET_SIZE(self) == 1) {
4586 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4587 if (*p != ch) {
4588 *p = ch;
4589 return 1;
4590 }
4591 else
4592 return 0;
4593 }
Tim Petersced69f82003-09-16 20:30:58 +00004594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 e = p + PyUnicode_GET_SIZE(self);
4596 previous_is_cased = 0;
4597 for (; p < e; p++) {
4598 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004599
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 if (previous_is_cased)
4601 *p = Py_UNICODE_TOLOWER(ch);
4602 else
4603 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004604
4605 if (Py_UNICODE_ISLOWER(ch) ||
4606 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 Py_UNICODE_ISTITLE(ch))
4608 previous_is_cased = 1;
4609 else
4610 previous_is_cased = 0;
4611 }
4612 return 1;
4613}
4614
Tim Peters8ce9f162004-08-27 01:49:32 +00004615PyObject *
4616PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Tim Peters8ce9f162004-08-27 01:49:32 +00004618 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004619 const Py_UNICODE blank = ' ';
4620 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004621 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004622 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004623 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4624 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4626 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004627 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004628 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004629 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 fseq = PySequence_Fast(seq, "");
4632 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004633 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004634 }
4635
Tim Peters91879ab2004-08-27 22:35:44 +00004636 /* Grrrr. A codec may be invoked to convert str objects to
4637 * Unicode, and so it's possible to call back into Python code
4638 * during PyUnicode_FromObject(), and so it's possible for a sick
4639 * codec to change the size of fseq (if seq is a list). Therefore
4640 * we have to keep refetching the size -- can't assume seqlen
4641 * is invariant.
4642 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004643 seqlen = PySequence_Fast_GET_SIZE(fseq);
4644 /* If empty sequence, return u"". */
4645 if (seqlen == 0) {
4646 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4647 goto Done;
4648 }
4649 /* If singleton sequence with an exact Unicode, return that. */
4650 if (seqlen == 1) {
4651 item = PySequence_Fast_GET_ITEM(fseq, 0);
4652 if (PyUnicode_CheckExact(item)) {
4653 Py_INCREF(item);
4654 res = (PyUnicodeObject *)item;
4655 goto Done;
4656 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004657 }
4658
Tim Peters05eba1f2004-08-27 21:32:02 +00004659 /* At least two items to join, or one that isn't exact Unicode. */
4660 if (seqlen > 1) {
4661 /* Set up sep and seplen -- they're needed. */
4662 if (separator == NULL) {
4663 sep = &blank;
4664 seplen = 1;
4665 }
4666 else {
4667 internal_separator = PyUnicode_FromObject(separator);
4668 if (internal_separator == NULL)
4669 goto onError;
4670 sep = PyUnicode_AS_UNICODE(internal_separator);
4671 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004672 /* In case PyUnicode_FromObject() mutated seq. */
4673 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 }
4675 }
4676
4677 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004678 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004680 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 res_p = PyUnicode_AS_UNICODE(res);
4682 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004683
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004685 Py_ssize_t itemlen;
4686 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004687
4688 item = PySequence_Fast_GET_ITEM(fseq, i);
4689 /* Convert item to Unicode. */
4690 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4691 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004692 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 " %.80s found",
4694 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004695 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004696 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004697 item = PyUnicode_FromObject(item);
4698 if (item == NULL)
4699 goto onError;
4700 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004701
Tim Peters91879ab2004-08-27 22:35:44 +00004702 /* In case PyUnicode_FromObject() mutated seq. */
4703 seqlen = PySequence_Fast_GET_SIZE(fseq);
4704
Tim Peters8ce9f162004-08-27 01:49:32 +00004705 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004707 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004708 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004709 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004710 if (i < seqlen - 1) {
4711 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004712 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004713 goto Overflow;
4714 }
4715 if (new_res_used > res_alloc) {
4716 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004717 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004718 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004719 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004720 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004721 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004722 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004723 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004725 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004726 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004728
4729 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004730 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004731 res_p += itemlen;
4732 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004733 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004734 res_p += seplen;
4735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004737 res_used = new_res_used;
4738 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004739
Tim Peters05eba1f2004-08-27 21:32:02 +00004740 /* Shrink res to match the used area; this probably can't fail,
4741 * but it's cheap to check.
4742 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004743 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004744 goto onError;
4745
4746 Done:
4747 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004748 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 return (PyObject *)res;
4750
Tim Peters8ce9f162004-08-27 01:49:32 +00004751 Overflow:
4752 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004753 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004754 Py_DECREF(item);
4755 /* fall through */
4756
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004759 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004760 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 return NULL;
4762}
4763
Tim Petersced69f82003-09-16 20:30:58 +00004764static
4765PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t left,
4767 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 Py_UNICODE fill)
4769{
4770 PyUnicodeObject *u;
4771
4772 if (left < 0)
4773 left = 0;
4774 if (right < 0)
4775 right = 0;
4776
Tim Peters7a29bd52001-09-12 03:03:31 +00004777 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 Py_INCREF(self);
4779 return self;
4780 }
4781
4782 u = _PyUnicode_New(left + self->length + right);
4783 if (u) {
4784 if (left)
4785 Py_UNICODE_FILL(u->str, fill, left);
4786 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4787 if (right)
4788 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4789 }
4790
4791 return u;
4792}
4793
/* Append the slice data[left:right] to `list` as a new unicode object.
   The expansion relies on names from the calling function: a `str`
   variable used as scratch storage, the `list` being filled, and an
   `onError` label that is jumped to on failure.
   The body is wrapped in do { } while (0) so that `SPLIT_APPEND(...);`
   is exactly one statement and stays correct inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4804
4805static
4806PyObject *split_whitespace(PyUnicodeObject *self,
4807 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004810 register Py_ssize_t i;
4811 register Py_ssize_t j;
4812 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 PyObject *str;
4814
4815 for (i = j = 0; i < len; ) {
4816 /* find a token */
4817 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4818 i++;
4819 j = i;
4820 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4821 i++;
4822 if (j < i) {
4823 if (maxcount-- <= 0)
4824 break;
4825 SPLIT_APPEND(self->str, j, i);
4826 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4827 i++;
4828 j = i;
4829 }
4830 }
4831 if (j < len) {
4832 SPLIT_APPEND(self->str, j, len);
4833 }
4834 return list;
4835
4836 onError:
4837 Py_DECREF(list);
4838 return NULL;
4839}
4840
4841PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004842 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 register Py_ssize_t i;
4845 register Py_ssize_t j;
4846 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 PyObject *list;
4848 PyObject *str;
4849 Py_UNICODE *data;
4850
4851 string = PyUnicode_FromObject(string);
4852 if (string == NULL)
4853 return NULL;
4854 data = PyUnicode_AS_UNICODE(string);
4855 len = PyUnicode_GET_SIZE(string);
4856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 list = PyList_New(0);
4858 if (!list)
4859 goto onError;
4860
4861 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004865 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
4868 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004869 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 if (i < len) {
4871 if (data[i] == '\r' && i + 1 < len &&
4872 data[i+1] == '\n')
4873 i += 2;
4874 else
4875 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004876 if (keepends)
4877 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 }
Guido van Rossum86662912000-04-11 15:38:46 +00004879 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 j = i;
4881 }
4882 if (j < len) {
4883 SPLIT_APPEND(data, j, len);
4884 }
4885
4886 Py_DECREF(string);
4887 return list;
4888
4889 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004890 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 Py_DECREF(string);
4892 return NULL;
4893}
4894
Tim Petersced69f82003-09-16 20:30:58 +00004895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896PyObject *split_char(PyUnicodeObject *self,
4897 PyObject *list,
4898 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004901 register Py_ssize_t i;
4902 register Py_ssize_t j;
4903 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 PyObject *str;
4905
4906 for (i = j = 0; i < len; ) {
4907 if (self->str[i] == ch) {
4908 if (maxcount-- <= 0)
4909 break;
4910 SPLIT_APPEND(self->str, j, i);
4911 i = j = i + 1;
4912 } else
4913 i++;
4914 }
4915 if (j <= len) {
4916 SPLIT_APPEND(self->str, j, len);
4917 }
4918 return list;
4919
4920 onError:
4921 Py_DECREF(list);
4922 return NULL;
4923}
4924
Tim Petersced69f82003-09-16 20:30:58 +00004925static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926PyObject *split_substring(PyUnicodeObject *self,
4927 PyObject *list,
4928 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004929 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004931 register Py_ssize_t i;
4932 register Py_ssize_t j;
4933 Py_ssize_t len = self->length;
4934 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 PyObject *str;
4936
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004937 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 if (Py_UNICODE_MATCH(self, i, substring)) {
4939 if (maxcount-- <= 0)
4940 break;
4941 SPLIT_APPEND(self->str, j, i);
4942 i = j = i + sublen;
4943 } else
4944 i++;
4945 }
4946 if (j <= len) {
4947 SPLIT_APPEND(self->str, j, len);
4948 }
4949 return list;
4950
4951 onError:
4952 Py_DECREF(list);
4953 return NULL;
4954}
4955
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956static
4957PyObject *rsplit_whitespace(PyUnicodeObject *self,
4958 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004961 register Py_ssize_t i;
4962 register Py_ssize_t j;
4963 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004964 PyObject *str;
4965
4966 for (i = j = len - 1; i >= 0; ) {
4967 /* find a token */
4968 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4969 i--;
4970 j = i;
4971 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4972 i--;
4973 if (j > i) {
4974 if (maxcount-- <= 0)
4975 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004976 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004977 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4978 i--;
4979 j = i;
4980 }
4981 }
4982 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004983 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004985 if (PyList_Reverse(list) < 0)
4986 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004987 return list;
4988
4989 onError:
4990 Py_DECREF(list);
4991 return NULL;
4992}
4993
4994static
4995PyObject *rsplit_char(PyUnicodeObject *self,
4996 PyObject *list,
4997 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004998 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004999{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 register Py_ssize_t i;
5001 register Py_ssize_t j;
5002 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003 PyObject *str;
5004
5005 for (i = j = len - 1; i >= 0; ) {
5006 if (self->str[i] == ch) {
5007 if (maxcount-- <= 0)
5008 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005009 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005010 j = i = i - 1;
5011 } else
5012 i--;
5013 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005014 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005015 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005016 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005017 if (PyList_Reverse(list) < 0)
5018 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005019 return list;
5020
5021 onError:
5022 Py_DECREF(list);
5023 return NULL;
5024}
5025
5026static
5027PyObject *rsplit_substring(PyUnicodeObject *self,
5028 PyObject *list,
5029 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005030 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005031{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005032 register Py_ssize_t i;
5033 register Py_ssize_t j;
5034 Py_ssize_t len = self->length;
5035 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036 PyObject *str;
5037
5038 for (i = len - sublen, j = len; i >= 0; ) {
5039 if (Py_UNICODE_MATCH(self, i, substring)) {
5040 if (maxcount-- <= 0)
5041 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005042 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005043 j = i;
5044 i -= sublen;
5045 } else
5046 i--;
5047 }
5048 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005049 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005050 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005051 if (PyList_Reverse(list) < 0)
5052 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053 return list;
5054
5055 onError:
5056 Py_DECREF(list);
5057 return NULL;
5058}
5059
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060#undef SPLIT_APPEND
5061
5062static
5063PyObject *split(PyUnicodeObject *self,
5064 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
5067 PyObject *list;
5068
5069 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005070 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
5072 list = PyList_New(0);
5073 if (!list)
5074 return NULL;
5075
5076 if (substring == NULL)
5077 return split_whitespace(self,list,maxcount);
5078
5079 else if (substring->length == 1)
5080 return split_char(self,list,substring->str[0],maxcount);
5081
5082 else if (substring->length == 0) {
5083 Py_DECREF(list);
5084 PyErr_SetString(PyExc_ValueError, "empty separator");
5085 return NULL;
5086 }
5087 else
5088 return split_substring(self,list,substring,maxcount);
5089}
5090
Tim Petersced69f82003-09-16 20:30:58 +00005091static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005092PyObject *rsplit(PyUnicodeObject *self,
5093 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005094 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005095{
5096 PyObject *list;
5097
5098 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005099 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005100
5101 list = PyList_New(0);
5102 if (!list)
5103 return NULL;
5104
5105 if (substring == NULL)
5106 return rsplit_whitespace(self,list,maxcount);
5107
5108 else if (substring->length == 1)
5109 return rsplit_char(self,list,substring->str[0],maxcount);
5110
5111 else if (substring->length == 0) {
5112 Py_DECREF(list);
5113 PyErr_SetString(PyExc_ValueError, "empty separator");
5114 return NULL;
5115 }
5116 else
5117 return rsplit_substring(self,list,substring,maxcount);
5118}
5119
5120static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121PyObject *replace(PyUnicodeObject *self,
5122 PyUnicodeObject *str1,
5123 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125{
5126 PyUnicodeObject *u;
5127
5128 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005129 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 if (str1->length == str2->length) {
5132 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005133 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005134 if (str1->length == 1) {
5135 /* replace characters */
5136 Py_UNICODE u1, u2;
5137 if (!findchar(self->str, self->length, str1->str[0]))
5138 goto nothing;
5139 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5140 if (!u)
5141 return NULL;
5142 Py_UNICODE_COPY(u->str, self->str, self->length);
5143 u1 = str1->str[0];
5144 u2 = str2->str[0];
5145 for (i = 0; i < u->length; i++)
5146 if (u->str[i] == u1) {
5147 if (--maxcount < 0)
5148 break;
5149 u->str[i] = u2;
5150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005152 i = fastsearch(
5153 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005155 if (i < 0)
5156 goto nothing;
5157 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5158 if (!u)
5159 return NULL;
5160 Py_UNICODE_COPY(u->str, self->str, self->length);
5161 while (i <= self->length - str1->length)
5162 if (Py_UNICODE_MATCH(self, i, str1)) {
5163 if (--maxcount < 0)
5164 break;
5165 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5166 i += str1->length;
5167 } else
5168 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005171
5172 Py_ssize_t n, i, j, e;
5173 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_UNICODE *p;
5175
5176 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005177 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 if (n > maxcount)
5179 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180 if (n == 0)
5181 goto nothing;
5182 /* new_size = self->length + n * (str2->length - str1->length)); */
5183 delta = (str2->length - str1->length);
5184 if (delta == 0) {
5185 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005187 product = n * (str2->length - str1->length);
5188 if ((product / (str2->length - str1->length)) != n) {
5189 PyErr_SetString(PyExc_OverflowError,
5190 "replace string is too long");
5191 return NULL;
5192 }
5193 new_size = self->length + product;
5194 if (new_size < 0) {
5195 PyErr_SetString(PyExc_OverflowError,
5196 "replace string is too long");
5197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 }
5199 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005200 u = _PyUnicode_New(new_size);
5201 if (!u)
5202 return NULL;
5203 i = 0;
5204 p = u->str;
5205 e = self->length - str1->length;
5206 if (str1->length > 0) {
5207 while (n-- > 0) {
5208 /* look for next match */
5209 j = i;
5210 while (j <= e) {
5211 if (Py_UNICODE_MATCH(self, j, str1))
5212 break;
5213 j++;
5214 }
5215 if (j > i) {
5216 if (j > e)
5217 break;
5218 /* copy unchanged part [i:j] */
5219 Py_UNICODE_COPY(p, self->str+i, j-i);
5220 p += j - i;
5221 }
5222 /* copy substitution string */
5223 if (str2->length > 0) {
5224 Py_UNICODE_COPY(p, str2->str, str2->length);
5225 p += str2->length;
5226 }
5227 i = j + str1->length;
5228 }
5229 if (i < self->length)
5230 /* copy tail [i:] */
5231 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5232 } else {
5233 /* interleave */
5234 while (n > 0) {
5235 Py_UNICODE_COPY(p, str2->str, str2->length);
5236 p += str2->length;
5237 if (--n <= 0)
5238 break;
5239 *p++ = self->str[i++];
5240 }
5241 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005245
5246nothing:
5247 /* nothing to replace; return original string (when possible) */
5248 if (PyUnicode_CheckExact(self)) {
5249 Py_INCREF(self);
5250 return (PyObject *) self;
5251 }
5252 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253}
5254
5255/* --- Unicode Object Methods --------------------------------------------- */
5256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005257PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258"S.title() -> unicode\n\
5259\n\
5260Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005264unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 return fixup(self, fixtitle);
5267}
5268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005269PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270"S.capitalize() -> unicode\n\
5271\n\
5272Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
5275static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005276unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 return fixup(self, fixcapitalize);
5279}
5280
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split `self` on whitespace,
   run every word through fixup() with fixcapitalize, and join the words
   again with PyUnicode_Join() using a NULL separator.  Returns NULL on
   failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5318
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005319/* Argument converter. Coerces to a single unicode character */
5320
5321static int
5322convert_uc(PyObject *obj, void *addr)
5323{
5324 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5325 PyObject *uniobj;
5326 Py_UNICODE *unistr;
5327
5328 uniobj = PyUnicode_FromObject(obj);
5329 if (uniobj == NULL) {
5330 PyErr_SetString(PyExc_TypeError,
5331 "The fill character cannot be converted to Unicode");
5332 return 0;
5333 }
5334 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5335 PyErr_SetString(PyExc_TypeError,
5336 "The fill character must be exactly one character long");
5337 Py_DECREF(uniobj);
5338 return 0;
5339 }
5340 unistr = PyUnicode_AS_UNICODE(uniobj);
5341 *fillcharloc = unistr[0];
5342 Py_DECREF(uniobj);
5343 return 1;
5344}
5345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005346PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005347"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005349Return S centered in a Unicode string of length width. Padding is\n\
5350done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352static PyObject *
5353unicode_center(PyUnicodeObject *self, PyObject *args)
5354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005355 Py_ssize_t marg, left;
5356 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005357 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Thomas Woutersde017742006-02-16 19:34:37 +00005359 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 return NULL;
5361
Tim Peters7a29bd52001-09-12 03:03:31 +00005362 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 Py_INCREF(self);
5364 return (PyObject*) self;
5365 }
5366
5367 marg = width - self->length;
5368 left = marg / 2 + (marg & width & 1);
5369
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005370 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371}
5372
Marc-André Lemburge5034372000-08-08 08:04:29 +00005373#if 0
5374
5375/* This code should go into some future Unicode collation support
5376 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005377 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005378
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005379/* speedy UTF-16 code point order comparison */
5380/* gleaned from: */
5381/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5382
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005383static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005384{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005385 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005386 0, 0, 0, 0, 0, 0, 0, 0,
5387 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005388 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005389};
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391static int
5392unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 Py_UNICODE *s1 = str1->str;
5397 Py_UNICODE *s2 = str2->str;
5398
5399 len1 = str1->length;
5400 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005403 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005404
5405 c1 = *s1++;
5406 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005407
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005408 if (c1 > (1<<11) * 26)
5409 c1 += utf16Fixup[c1>>11];
5410 if (c2 > (1<<11) * 26)
5411 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005412 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005413
5414 if (c1 != c2)
5415 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005416
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005417 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
5419
5420 return (len1 < len2) ? -1 : (len1 != len2);
5421}
5422
Marc-André Lemburge5034372000-08-08 08:04:29 +00005423#else
5424
5425static int
5426unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005429
5430 Py_UNICODE *s1 = str1->str;
5431 Py_UNICODE *s2 = str2->str;
5432
5433 len1 = str1->length;
5434 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005435
Marc-André Lemburge5034372000-08-08 08:04:29 +00005436 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005437 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005438
Fredrik Lundh45714e92001-06-26 16:39:36 +00005439 c1 = *s1++;
5440 c2 = *s2++;
5441
5442 if (c1 != c2)
5443 return (c1 < c2) ? -1 : 1;
5444
Marc-André Lemburge5034372000-08-08 08:04:29 +00005445 len1--; len2--;
5446 }
5447
5448 return (len1 < len2) ? -1 : (len1 != len2);
5449}
5450
5451#endif
5452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453int PyUnicode_Compare(PyObject *left,
5454 PyObject *right)
5455{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005456 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5457 return unicode_compare((PyUnicodeObject *)left,
5458 (PyUnicodeObject *)right);
5459 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5460 (PyUnicode_Check(left) && PyString_Check(right))) {
5461 if (PyUnicode_Check(left))
5462 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5463 if (PyUnicode_Check(right))
5464 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5465 assert(PyString_Check(left));
5466 assert(PyString_Check(right));
5467 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005469 PyErr_Format(PyExc_TypeError,
5470 "Can't compare %.100s and %.100s",
5471 left->ob_type->tp_name,
5472 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return -1;
5474}
5475
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005476PyObject *PyUnicode_RichCompare(PyObject *left,
5477 PyObject *right,
5478 int op)
5479{
5480 int result;
5481
5482 result = PyUnicode_Compare(left, right);
5483 if (result == -1 && PyErr_Occurred())
5484 goto onError;
5485
5486 /* Convert the return value to a Boolean */
5487 switch (op) {
5488 case Py_EQ:
5489 result = (result == 0);
5490 break;
5491 case Py_NE:
5492 result = (result != 0);
5493 break;
5494 case Py_LE:
5495 result = (result <= 0);
5496 break;
5497 case Py_GE:
5498 result = (result >= 0);
5499 break;
5500 case Py_LT:
5501 result = (result == -1);
5502 break;
5503 case Py_GT:
5504 result = (result == 1);
5505 break;
5506 }
5507 return PyBool_FromLong(result);
5508
5509 onError:
5510
5511 /* Standard case
5512
5513 Type errors mean that PyUnicode_FromObject() could not convert
5514 one of the arguments (usually the right hand side) to Unicode,
5515 ie. we can't handle the comparison request. However, it is
5516 possible that the other object knows a comparison method, which
5517 is why we return Py_NotImplemented to give the other object a
5518 chance.
5519
5520 */
5521 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5522 PyErr_Clear();
5523 Py_INCREF(Py_NotImplemented);
5524 return Py_NotImplemented;
5525 }
5526 if (op != Py_EQ && op != Py_NE)
5527 return NULL;
5528
5529 /* Equality comparison.
5530
5531 This is a special case: we silence any PyExc_UnicodeDecodeError
5532 and instead turn it into a PyErr_UnicodeWarning.
5533
5534 */
5535 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5536 return NULL;
5537 PyErr_Clear();
5538 if (PyErr_Warn(PyExc_UnicodeWarning,
5539 (op == Py_EQ) ?
5540 "Unicode equal comparison "
5541 "failed to convert both arguments to Unicode - "
5542 "interpreting them as being unequal" :
5543 "Unicode unequal comparison "
5544 "failed to convert both arguments to Unicode - "
5545 "interpreting them as being unequal"
5546 ) < 0)
5547 return NULL;
5548 result = (op == Py_NE);
5549 return PyBool_FromLong(result);
5550}
5551
Guido van Rossum403d68b2000-03-13 15:55:09 +00005552int PyUnicode_Contains(PyObject *container,
5553 PyObject *element)
5554{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005557
5558 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559 sub = PyUnicode_FromObject(element);
5560 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005561 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005562 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005563 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005564 }
5565
Thomas Wouters477c8d52006-05-27 19:21:47 +00005566 str = PyUnicode_FromObject(container);
5567 if (!str) {
5568 Py_DECREF(sub);
5569 return -1;
5570 }
5571
5572 result = stringlib_contains_obj(str, sub);
5573
5574 Py_DECREF(str);
5575 Py_DECREF(sub);
5576
Guido van Rossum403d68b2000-03-13 15:55:09 +00005577 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005578}
5579
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580/* Concat to string or Unicode object giving a new Unicode object. */
5581
5582PyObject *PyUnicode_Concat(PyObject *left,
5583 PyObject *right)
5584{
5585 PyUnicodeObject *u = NULL, *v = NULL, *w;
5586
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005587 if (PyBytes_Check(left) || PyBytes_Check(right))
5588 return PyBytes_Concat(left, right);
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 /* Coerce the two arguments */
5591 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5592 if (u == NULL)
5593 goto onError;
5594 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5595 if (v == NULL)
5596 goto onError;
5597
5598 /* Shortcuts */
5599 if (v == unicode_empty) {
5600 Py_DECREF(v);
5601 return (PyObject *)u;
5602 }
5603 if (u == unicode_empty) {
5604 Py_DECREF(u);
5605 return (PyObject *)v;
5606 }
5607
5608 /* Concat the two Unicode strings */
5609 w = _PyUnicode_New(u->length + v->length);
5610 if (w == NULL)
5611 goto onError;
5612 Py_UNICODE_COPY(w->str, u->str, u->length);
5613 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5614
5615 Py_DECREF(u);
5616 Py_DECREF(v);
5617 return (PyObject *)w;
5618
5619onError:
5620 Py_XDECREF(u);
5621 Py_XDECREF(v);
5622 return NULL;
5623}
5624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626"S.count(sub[, start[, end]]) -> int\n\
5627\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005628Return the number of non-overlapping occurrences of substring sub in\n\
5629Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005630interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
5632static PyObject *
5633unicode_count(PyUnicodeObject *self, PyObject *args)
5634{
5635 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005637 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 PyObject *result;
5639
Guido van Rossumb8872e62000-05-09 14:14:27 +00005640 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5641 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 return NULL;
5643
5644 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005645 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 if (substring == NULL)
5647 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005648
Thomas Wouters477c8d52006-05-27 19:21:47 +00005649 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Thomas Wouters477c8d52006-05-27 19:21:47 +00005651 result = PyInt_FromSsize_t(
5652 stringlib_count(self->str + start, end - start,
5653 substring->str, substring->length)
5654 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
5656 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005657
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 return result;
5659}
5660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005662"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005664Encodes S using the codec registered for encoding. encoding defaults\n\
5665to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005666handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5668'xmlcharrefreplace' as well as any other name registered with\n\
5669codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
5671static PyObject *
5672unicode_encode(PyUnicodeObject *self, PyObject *args)
5673{
5674 char *encoding = NULL;
5675 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005676 PyObject *v;
5677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5679 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005680 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005681 if (v == NULL)
5682 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005683 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005684 if (PyString_Check(v)) {
5685 /* Old codec, turn it into bytes */
5686 PyObject *b = PyBytes_FromObject(v);
5687 Py_DECREF(v);
5688 return b;
5689 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005690 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005691 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005692 "(type=%.400s)",
5693 v->ob_type->tp_name);
5694 Py_DECREF(v);
5695 return NULL;
5696 }
5697 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005698
5699 onError:
5700 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005701}
5702
5703PyDoc_STRVAR(decode__doc__,
5704"S.decode([encoding[,errors]]) -> string or unicode\n\
5705\n\
5706Decodes S using the codec registered for encoding. encoding defaults\n\
5707to the default encoding. errors may be given to set a different error\n\
5708handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5709a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5710as well as any other name registerd with codecs.register_error that is\n\
5711able to handle UnicodeDecodeErrors.");
5712
5713static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005714unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005715{
5716 char *encoding = NULL;
5717 char *errors = NULL;
5718 PyObject *v;
5719
5720 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5721 return NULL;
5722 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005723 if (v == NULL)
5724 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005725 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5726 PyErr_Format(PyExc_TypeError,
5727 "decoder did not return a string/unicode object "
5728 "(type=%.400s)",
5729 v->ob_type->tp_name);
5730 Py_DECREF(v);
5731 return NULL;
5732 }
5733 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005734
5735 onError:
5736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005739PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740"S.expandtabs([tabsize]) -> unicode\n\
5741\n\
5742Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
5745static PyObject*
5746unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5747{
5748 Py_UNICODE *e;
5749 Py_UNICODE *p;
5750 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 PyUnicodeObject *u;
5753 int tabsize = 8;
5754
5755 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5756 return NULL;
5757
Thomas Wouters7e474022000-07-16 12:04:32 +00005758 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 i = j = 0;
5760 e = self->str + self->length;
5761 for (p = self->str; p < e; p++)
5762 if (*p == '\t') {
5763 if (tabsize > 0)
5764 j += tabsize - (j % tabsize);
5765 }
5766 else {
5767 j++;
5768 if (*p == '\n' || *p == '\r') {
5769 i += j;
5770 j = 0;
5771 }
5772 }
5773
5774 /* Second pass: create output string and fill it */
5775 u = _PyUnicode_New(i + j);
5776 if (!u)
5777 return NULL;
5778
5779 j = 0;
5780 q = u->str;
5781
5782 for (p = self->str; p < e; p++)
5783 if (*p == '\t') {
5784 if (tabsize > 0) {
5785 i = tabsize - (j % tabsize);
5786 j += i;
5787 while (i--)
5788 *q++ = ' ';
5789 }
5790 }
5791 else {
5792 j++;
5793 *q++ = *p;
5794 if (*p == '\n' || *p == '\r')
5795 j = 0;
5796 }
5797
5798 return (PyObject*) u;
5799}
5800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005801PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802"S.find(sub [,start [,end]]) -> int\n\
5803\n\
5804Return the lowest index in S where substring sub is found,\n\
5805such that sub is contained within s[start,end]. Optional\n\
5806arguments start and end are interpreted as in slice notation.\n\
5807\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005808Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
5810static PyObject *
5811unicode_find(PyUnicodeObject *self, PyObject *args)
5812{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005815 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005816 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Guido van Rossumb8872e62000-05-09 14:14:27 +00005818 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5819 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005821 substring = PyUnicode_FromObject(substring);
5822 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 return NULL;
5824
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 result = stringlib_find_slice(
5826 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5827 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5828 start, end
5829 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
5831 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005832
5833 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834}
5835
5836static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838{
5839 if (index < 0 || index >= self->length) {
5840 PyErr_SetString(PyExc_IndexError, "string index out of range");
5841 return NULL;
5842 }
5843
5844 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5845}
5846
5847static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005848unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005850 /* Since Unicode objects compare equal to their UTF-8 string
5851 counterparts, we hash the UTF-8 string. */
5852 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5853 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854}
5855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005856PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857"S.index(sub [,start [,end]]) -> int\n\
5858\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005859Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
5861static PyObject *
5862unicode_index(PyUnicodeObject *self, PyObject *args)
5863{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005865 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005867 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Guido van Rossumb8872e62000-05-09 14:14:27 +00005869 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5870 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005872 substring = PyUnicode_FromObject(substring);
5873 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 return NULL;
5875
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876 result = stringlib_find_slice(
5877 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5878 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5879 start, end
5880 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
5882 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005883
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 if (result < 0) {
5885 PyErr_SetString(PyExc_ValueError, "substring not found");
5886 return NULL;
5887 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888
Martin v. Löwis18e16552006-02-15 17:27:45 +00005889 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890}
5891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005892PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005893"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005895Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005896at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
5898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005899unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
5901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5902 register const Py_UNICODE *e;
5903 int cased;
5904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 /* Shortcut for single character strings */
5906 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005907 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005909 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005910 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005912
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 e = p + PyUnicode_GET_SIZE(self);
5914 cased = 0;
5915 for (; p < e; p++) {
5916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 else if (!cased && Py_UNICODE_ISLOWER(ch))
5921 cased = 1;
5922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005923 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924}
5925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005926PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005927"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005929Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
5932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005933unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934{
5935 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5936 register const Py_UNICODE *e;
5937 int cased;
5938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 /* Shortcut for single character strings */
5940 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005941 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005943 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005944 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 e = p + PyUnicode_GET_SIZE(self);
5948 cased = 0;
5949 for (; p < e; p++) {
5950 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005951
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005953 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 else if (!cased && Py_UNICODE_ISUPPER(ch))
5955 cased = 1;
5956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005957 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958}
5959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005960PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005963Return True if S is a titlecased string and there is at least one\n\
5964character in S, i.e. upper- and titlecase characters may only\n\
5965follow uncased characters and lowercase characters only cased ones.\n\
5966Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005969unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970{
5971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5972 register const Py_UNICODE *e;
5973 int cased, previous_is_cased;
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 /* Shortcut for single character strings */
5976 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005977 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5978 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005980 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005981 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 e = p + PyUnicode_GET_SIZE(self);
5985 cased = 0;
5986 previous_is_cased = 0;
5987 for (; p < e; p++) {
5988 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005989
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5991 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 previous_is_cased = 1;
5994 cased = 1;
5995 }
5996 else if (Py_UNICODE_ISLOWER(ch)) {
5997 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005998 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 previous_is_cased = 1;
6000 cased = 1;
6001 }
6002 else
6003 previous_is_cased = 0;
6004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006005 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006}
6007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006008PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006011Return True if all characters in S are whitespace\n\
6012and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
6014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006015unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016{
6017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6018 register const Py_UNICODE *e;
6019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 /* Shortcut for single character strings */
6021 if (PyUnicode_GET_SIZE(self) == 1 &&
6022 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006025 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006026 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 e = p + PyUnicode_GET_SIZE(self);
6030 for (; p < e; p++) {
6031 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006040Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006042
6043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006044unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006045{
6046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6047 register const Py_UNICODE *e;
6048
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049 /* Shortcut for single character strings */
6050 if (PyUnicode_GET_SIZE(self) == 1 &&
6051 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006053
6054 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006055 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006056 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006057
6058 e = p + PyUnicode_GET_SIZE(self);
6059 for (; p < e; p++) {
6060 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006063 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006064}
6065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006066PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006067"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006069Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006071
6072static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006073unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006074{
6075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6076 register const Py_UNICODE *e;
6077
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006078 /* Shortcut for single character strings */
6079 if (PyUnicode_GET_SIZE(self) == 1 &&
6080 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006081 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006082
6083 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006084 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006085 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006086
6087 e = p + PyUnicode_GET_SIZE(self);
6088 for (; p < e; p++) {
6089 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006092 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006093}
6094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006098Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
6101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006102unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6105 register const Py_UNICODE *e;
6106
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 /* Shortcut for single character strings */
6108 if (PyUnicode_GET_SIZE(self) == 1 &&
6109 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006112 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006113 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006114 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006115
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 e = p + PyUnicode_GET_SIZE(self);
6117 for (; p < e; p++) {
6118 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122}
6123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006127Return True if all characters in S are digits\n\
6128and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
6130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006131unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
6133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6134 register const Py_UNICODE *e;
6135
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 /* Shortcut for single character strings */
6137 if (PyUnicode_GET_SIZE(self) == 1 &&
6138 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006141 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006142 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 e = p + PyUnicode_GET_SIZE(self);
6146 for (; p < e; p++) {
6147 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151}
6152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006153PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006156Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006157False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
6159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006160unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
6162 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6163 register const Py_UNICODE *e;
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 /* Shortcut for single character strings */
6166 if (PyUnicode_GET_SIZE(self) == 1 &&
6167 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006168 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006170 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006171 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 e = p + PyUnicode_GET_SIZE(self);
6175 for (; p < e; p++) {
6176 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183"S.join(sequence) -> unicode\n\
6184\n\
6185Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006189unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006191 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192}
6193
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195unicode_length(PyUnicodeObject *self)
6196{
6197 return self->length;
6198}
6199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006200PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006201"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202\n\
6203Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006204done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205
6206static PyObject *
6207unicode_ljust(PyUnicodeObject *self, PyObject *args)
6208{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006209 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006210 Py_UNICODE fillchar = ' ';
6211
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006212 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return NULL;
6214
Tim Peters7a29bd52001-09-12 03:03:31 +00006215 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 Py_INCREF(self);
6217 return (PyObject*) self;
6218 }
6219
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006220 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224"S.lower() -> unicode\n\
6225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006226Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
6228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006229unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 return fixup(self, fixlower);
6232}
6233
/* Strip modes shared by the strip helpers; they also index stripformat. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for mode i: the format string with its "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
6242
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243/* externally visible for str.strip(unicode) */
6244PyObject *
6245_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6246{
6247 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006249 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006250 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6251 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006252
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6254
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006255 i = 0;
6256 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006257 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6258 i++;
6259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006260 }
6261
6262 j = len;
6263 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 do {
6265 j--;
6266 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6267 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006268 }
6269
6270 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006271 Py_INCREF(self);
6272 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006273 }
6274 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006275 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006276}
6277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278
6279static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006282 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006283 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006284
6285 i = 0;
6286 if (striptype != RIGHTSTRIP) {
6287 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6288 i++;
6289 }
6290 }
6291
6292 j = len;
6293 if (striptype != LEFTSTRIP) {
6294 do {
6295 j--;
6296 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6297 j++;
6298 }
6299
6300 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6301 Py_INCREF(self);
6302 return (PyObject*)self;
6303 }
6304 else
6305 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308
6309static PyObject *
6310do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6311{
6312 PyObject *sep = NULL;
6313
6314 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6315 return NULL;
6316
6317 if (sep != NULL && sep != Py_None) {
6318 if (PyUnicode_Check(sep))
6319 return _PyUnicode_XStrip(self, striptype, sep);
6320 else if (PyString_Check(sep)) {
6321 PyObject *res;
6322 sep = PyUnicode_FromObject(sep);
6323 if (sep==NULL)
6324 return NULL;
6325 res = _PyUnicode_XStrip(self, striptype, sep);
6326 Py_DECREF(sep);
6327 return res;
6328 }
6329 else {
6330 PyErr_Format(PyExc_TypeError,
6331 "%s arg must be None, unicode or str",
6332 STRIPNAME(striptype));
6333 return NULL;
6334 }
6335 }
6336
6337 return do_strip(self, striptype);
6338}
6339
6340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006342"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006343\n\
6344Return a copy of the string S with leading and trailing\n\
6345whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006346If chars is given and not None, remove characters in chars instead.\n\
6347If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006348
6349static PyObject *
6350unicode_strip(PyUnicodeObject *self, PyObject *args)
6351{
6352 if (PyTuple_GET_SIZE(args) == 0)
6353 return do_strip(self, BOTHSTRIP); /* Common case */
6354 else
6355 return do_argstrip(self, BOTHSTRIP, args);
6356}
6357
6358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006359PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006360"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006361\n\
6362Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006363If chars is given and not None, remove characters in chars instead.\n\
6364If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006365
6366static PyObject *
6367unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6368{
6369 if (PyTuple_GET_SIZE(args) == 0)
6370 return do_strip(self, LEFTSTRIP); /* Common case */
6371 else
6372 return do_argstrip(self, LEFTSTRIP, args);
6373}
6374
6375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006376PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006377"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006378\n\
6379Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006380If chars is given and not None, remove characters in chars instead.\n\
6381If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006382
6383static PyObject *
6384unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6385{
6386 if (PyTuple_GET_SIZE(args) == 0)
6387 return do_strip(self, RIGHTSTRIP); /* Common case */
6388 else
6389 return do_argstrip(self, RIGHTSTRIP, args);
6390}
6391
6392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006394unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
6396 PyUnicodeObject *u;
6397 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006399 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401 if (len < 0)
6402 len = 0;
6403
Tim Peters7a29bd52001-09-12 03:03:31 +00006404 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 /* no repeat, return original string */
6406 Py_INCREF(str);
6407 return (PyObject*) str;
6408 }
Tim Peters8f422462000-09-09 06:13:41 +00006409
6410 /* ensure # of chars needed doesn't overflow int and # of bytes
6411 * needed doesn't overflow size_t
6412 */
6413 nchars = len * str->length;
6414 if (len && nchars / len != str->length) {
6415 PyErr_SetString(PyExc_OverflowError,
6416 "repeated string is too long");
6417 return NULL;
6418 }
6419 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6420 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6421 PyErr_SetString(PyExc_OverflowError,
6422 "repeated string is too long");
6423 return NULL;
6424 }
6425 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 if (!u)
6427 return NULL;
6428
6429 p = u->str;
6430
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 if (str->length == 1 && len > 0) {
6432 Py_UNICODE_FILL(p, str->str[0], len);
6433 } else {
6434 Py_ssize_t done = 0; /* number of characters copied this far */
6435 if (done < nchars) {
6436 Py_UNICODE_COPY(p, str->str, str->length);
6437 done = str->length;
6438 }
6439 while (done < nchars) {
6440 int n = (done <= nchars-done) ? done : nchars-done;
6441 Py_UNICODE_COPY(p+done, p, n);
6442 done += n;
6443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
6445
6446 return (PyObject*) u;
6447}
6448
6449PyObject *PyUnicode_Replace(PyObject *obj,
6450 PyObject *subobj,
6451 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
6454 PyObject *self;
6455 PyObject *str1;
6456 PyObject *str2;
6457 PyObject *result;
6458
6459 self = PyUnicode_FromObject(obj);
6460 if (self == NULL)
6461 return NULL;
6462 str1 = PyUnicode_FromObject(subobj);
6463 if (str1 == NULL) {
6464 Py_DECREF(self);
6465 return NULL;
6466 }
6467 str2 = PyUnicode_FromObject(replobj);
6468 if (str2 == NULL) {
6469 Py_DECREF(self);
6470 Py_DECREF(str1);
6471 return NULL;
6472 }
Tim Petersced69f82003-09-16 20:30:58 +00006473 result = replace((PyUnicodeObject *)self,
6474 (PyUnicodeObject *)str1,
6475 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 maxcount);
6477 Py_DECREF(self);
6478 Py_DECREF(str1);
6479 Py_DECREF(str2);
6480 return result;
6481}
6482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006483PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484"S.replace (old, new[, maxsplit]) -> unicode\n\
6485\n\
6486Return a copy of S with all occurrences of substring\n\
6487old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006488given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
6490static PyObject*
6491unicode_replace(PyUnicodeObject *self, PyObject *args)
6492{
6493 PyUnicodeObject *str1;
6494 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 PyObject *result;
6497
Martin v. Löwis18e16552006-02-15 17:27:45 +00006498 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return NULL;
6500 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6501 if (str1 == NULL)
6502 return NULL;
6503 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006504 if (str2 == NULL) {
6505 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
6509 result = replace(self, str1, str2, maxcount);
6510
6511 Py_DECREF(str1);
6512 Py_DECREF(str2);
6513 return result;
6514}
6515
6516static
6517PyObject *unicode_repr(PyObject *unicode)
6518{
6519 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6520 PyUnicode_GET_SIZE(unicode),
6521 1);
6522}
6523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006524PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525"S.rfind(sub [,start [,end]]) -> int\n\
6526\n\
6527Return the highest index in S where substring sub is found,\n\
6528such that sub is contained within s[start,end]. Optional\n\
6529arguments start and end are interpreted as in slice notation.\n\
6530\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006531Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
6533static PyObject *
6534unicode_rfind(PyUnicodeObject *self, PyObject *args)
6535{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006538 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006539 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
Guido van Rossumb8872e62000-05-09 14:14:27 +00006541 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6542 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006544 substring = PyUnicode_FromObject(substring);
6545 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 return NULL;
6547
Thomas Wouters477c8d52006-05-27 19:21:47 +00006548 result = stringlib_rfind_slice(
6549 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6550 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6551 start, end
6552 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553
6554 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006555
6556 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560"S.rindex(sub [,start [,end]]) -> int\n\
6561\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563
6564static PyObject *
6565unicode_rindex(PyUnicodeObject *self, PyObject *args)
6566{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006568 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006569 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
Guido van Rossumb8872e62000-05-09 14:14:27 +00006572 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 substring = PyUnicode_FromObject(substring);
6576 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 return NULL;
6578
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 result = stringlib_rfind_slice(
6580 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6581 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6582 start, end
6583 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 if (result < 0) {
6588 PyErr_SetString(PyExc_ValueError, "substring not found");
6589 return NULL;
6590 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006594PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006595"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596\n\
6597Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006598done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
6600static PyObject *
6601unicode_rjust(PyUnicodeObject *self, PyObject *args)
6602{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006603 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006604 Py_UNICODE fillchar = ' ';
6605
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006606 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 return NULL;
6608
Tim Peters7a29bd52001-09-12 03:03:31 +00006609 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 Py_INCREF(self);
6611 return (PyObject*) self;
6612 }
6613
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006614 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615}
6616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
6620 /* standard clamping */
6621 if (start < 0)
6622 start = 0;
6623 if (end < 0)
6624 end = 0;
6625 if (end > self->length)
6626 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006627 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 /* full slice, return original string */
6629 Py_INCREF(self);
6630 return (PyObject*) self;
6631 }
6632 if (start > end)
6633 start = end;
6634 /* copy slice */
6635 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6636 end - start);
6637}
6638
6639PyObject *PyUnicode_Split(PyObject *s,
6640 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006641 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
6643 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 s = PyUnicode_FromObject(s);
6646 if (s == NULL)
6647 return NULL;
6648 if (sep != NULL) {
6649 sep = PyUnicode_FromObject(sep);
6650 if (sep == NULL) {
6651 Py_DECREF(s);
6652 return NULL;
6653 }
6654 }
6655
6656 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6657
6658 Py_DECREF(s);
6659 Py_XDECREF(sep);
6660 return result;
6661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664"S.split([sep [,maxsplit]]) -> list of strings\n\
6665\n\
6666Return a list of the words in S, using sep as the\n\
6667delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006668splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006669any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
6671static PyObject*
6672unicode_split(PyUnicodeObject *self, PyObject *args)
6673{
6674 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 return NULL;
6679
6680 if (substring == Py_None)
6681 return split(self, NULL, maxcount);
6682 else if (PyUnicode_Check(substring))
6683 return split(self, (PyUnicodeObject *)substring, maxcount);
6684 else
6685 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6686}
6687
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688PyObject *
6689PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6690{
6691 PyObject* str_obj;
6692 PyObject* sep_obj;
6693 PyObject* out;
6694
6695 str_obj = PyUnicode_FromObject(str_in);
6696 if (!str_obj)
6697 return NULL;
6698 sep_obj = PyUnicode_FromObject(sep_in);
6699 if (!sep_obj) {
6700 Py_DECREF(str_obj);
6701 return NULL;
6702 }
6703
6704 out = stringlib_partition(
6705 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6706 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6707 );
6708
6709 Py_DECREF(sep_obj);
6710 Py_DECREF(str_obj);
6711
6712 return out;
6713}
6714
6715
6716PyObject *
6717PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6718{
6719 PyObject* str_obj;
6720 PyObject* sep_obj;
6721 PyObject* out;
6722
6723 str_obj = PyUnicode_FromObject(str_in);
6724 if (!str_obj)
6725 return NULL;
6726 sep_obj = PyUnicode_FromObject(sep_in);
6727 if (!sep_obj) {
6728 Py_DECREF(str_obj);
6729 return NULL;
6730 }
6731
6732 out = stringlib_rpartition(
6733 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6734 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6735 );
6736
6737 Py_DECREF(sep_obj);
6738 Py_DECREF(str_obj);
6739
6740 return out;
6741}
6742
6743PyDoc_STRVAR(partition__doc__,
6744"S.partition(sep) -> (head, sep, tail)\n\
6745\n\
6746Searches for the separator sep in S, and returns the part before it,\n\
6747the separator itself, and the part after it. If the separator is not\n\
6748found, returns S and two empty strings.");
6749
6750static PyObject*
6751unicode_partition(PyUnicodeObject *self, PyObject *separator)
6752{
6753 return PyUnicode_Partition((PyObject *)self, separator);
6754}
6755
6756PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006757"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006758\n\
6759Searches for the separator sep in S, starting at the end of S, and returns\n\
6760the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006761separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006762
6763static PyObject*
6764unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6765{
6766 return PyUnicode_RPartition((PyObject *)self, separator);
6767}
6768
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006769PyObject *PyUnicode_RSplit(PyObject *s,
6770 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006771 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006772{
6773 PyObject *result;
6774
6775 s = PyUnicode_FromObject(s);
6776 if (s == NULL)
6777 return NULL;
6778 if (sep != NULL) {
6779 sep = PyUnicode_FromObject(sep);
6780 if (sep == NULL) {
6781 Py_DECREF(s);
6782 return NULL;
6783 }
6784 }
6785
6786 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6787
6788 Py_DECREF(s);
6789 Py_XDECREF(sep);
6790 return result;
6791}
6792
6793PyDoc_STRVAR(rsplit__doc__,
6794"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6795\n\
6796Return a list of the words in S, using sep as the\n\
6797delimiter string, starting at the end of the string and\n\
6798working to the front. If maxsplit is given, at most maxsplit\n\
6799splits are done. If sep is not specified, any whitespace string\n\
6800is a separator.");
6801
6802static PyObject*
6803unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6804{
6805 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006806 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006807
Martin v. Löwis18e16552006-02-15 17:27:45 +00006808 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006809 return NULL;
6810
6811 if (substring == Py_None)
6812 return rsplit(self, NULL, maxcount);
6813 else if (PyUnicode_Check(substring))
6814 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6815 else
6816 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6817}
6818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006820"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821\n\
6822Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006823Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
6826static PyObject*
6827unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6828{
Guido van Rossum86662912000-04-11 15:38:46 +00006829 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
Guido van Rossum86662912000-04-11 15:38:46 +00006831 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832 return NULL;
6833
Guido van Rossum86662912000-04-11 15:38:46 +00006834 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
6837static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006838PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006840 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6841 Py_XINCREF(res);
6842 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843}
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846"S.swapcase() -> unicode\n\
6847\n\
6848Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006849and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006852unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return fixup(self, fixswapcase);
6855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858"S.translate(table) -> unicode\n\
6859\n\
6860Return a copy of the string S, where all characters have been mapped\n\
6861through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006862Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6863Unmapped characters are left untouched. Characters mapped to None\n\
6864are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
6866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006867unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Tim Petersced69f82003-09-16 20:30:58 +00006869 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006871 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 "ignore");
6873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876"S.upper() -> unicode\n\
6877\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006878Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
6880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006881unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 return fixup(self, fixupper);
6884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887"S.zfill(width) -> unicode\n\
6888\n\
6889Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject *
6893unicode_zfill(PyUnicodeObject *self, PyObject *args)
6894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 PyUnicodeObject *u;
6897
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t width;
6899 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 return NULL;
6901
6902 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006903 if (PyUnicode_CheckExact(self)) {
6904 Py_INCREF(self);
6905 return (PyObject*) self;
6906 }
6907 else
6908 return PyUnicode_FromUnicode(
6909 PyUnicode_AS_UNICODE(self),
6910 PyUnicode_GET_SIZE(self)
6911 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 }
6913
6914 fill = width - self->length;
6915
6916 u = pad(self, fill, 0, '0');
6917
Walter Dörwald068325e2002-04-15 13:36:47 +00006918 if (u == NULL)
6919 return NULL;
6920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 if (u->str[fill] == '+' || u->str[fill] == '-') {
6922 /* move sign to beginning of string */
6923 u->str[0] = u->str[fill];
6924 u->str[fill] = '0';
6925 }
6926
6927 return (PyObject*) u;
6928}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
#if 0
/* Disabled helper: report unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006941Return True if S starts with the specified prefix, False otherwise.\n\
6942With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943With optional end, stop comparing S at that position.\n\
6944prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
6946static PyObject *
6947unicode_startswith(PyUnicodeObject *self,
6948 PyObject *args)
6949{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006952 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006953 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006957 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959 if (PyTuple_Check(subobj)) {
6960 Py_ssize_t i;
6961 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6962 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6963 PyTuple_GET_ITEM(subobj, i));
6964 if (substring == NULL)
6965 return NULL;
6966 result = tailmatch(self, substring, start, end, -1);
6967 Py_DECREF(substring);
6968 if (result) {
6969 Py_RETURN_TRUE;
6970 }
6971 }
6972 /* nothing matched */
6973 Py_RETURN_FALSE;
6974 }
6975 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 return NULL;
6978 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981}
6982
6983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006984PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006987Return True if S ends with the specified suffix, False otherwise.\n\
6988With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006989With optional end, stop comparing S at that position.\n\
6990suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992static PyObject *
6993unicode_endswith(PyUnicodeObject *self,
6994 PyObject *args)
6995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006998 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006999 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7003 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005 if (PyTuple_Check(subobj)) {
7006 Py_ssize_t i;
7007 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7008 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7009 PyTuple_GET_ITEM(subobj, i));
7010 if (substring == NULL)
7011 return NULL;
7012 result = tailmatch(self, substring, start, end, +1);
7013 Py_DECREF(substring);
7014 if (result) {
7015 Py_RETURN_TRUE;
7016 }
7017 }
7018 Py_RETURN_FALSE;
7019 }
7020 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
7029
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007030
7031static PyObject *
7032unicode_getnewargs(PyUnicodeObject *v)
7033{
7034 return Py_BuildValue("(u#)", v->str, v->length);
7035}
7036
7037
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038static PyMethodDef unicode_methods[] = {
7039
7040 /* Order is according to common usage: often used methods should
7041 appear first, since lookup is done sequentially. */
7042
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007043 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7044 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7045 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007046 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7048 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7049 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7050 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7051 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7052 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7053 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007054 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007055 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7056 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7057 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007059 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007060/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7061 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7062 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7063 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007065 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007066 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007068 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7069 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7070 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7071 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7072 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7073 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7074 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7075 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7076 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7077 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7078 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7079 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7080 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7081 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007082 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007083#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007084 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085#endif
7086
7087#if 0
7088 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007089 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090#endif
7091
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007092 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 {NULL, NULL}
7094};
7095
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007096static PyObject *
7097unicode_mod(PyObject *v, PyObject *w)
7098{
7099 if (!PyUnicode_Check(v)) {
7100 Py_INCREF(Py_NotImplemented);
7101 return Py_NotImplemented;
7102 }
7103 return PyUnicode_Format(v, w);
7104}
7105
7106static PyNumberMethods unicode_as_number = {
7107 0, /*nb_add*/
7108 0, /*nb_subtract*/
7109 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007110 unicode_mod, /*nb_remainder*/
7111};
7112
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007114 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007115 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007116 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7117 (ssizeargfunc) unicode_getitem, /* sq_item */
7118 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 0, /* sq_ass_item */
7120 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007121 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122};
7123
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007124static PyObject*
7125unicode_subscript(PyUnicodeObject* self, PyObject* item)
7126{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007127 if (PyIndex_Check(item)) {
7128 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007129 if (i == -1 && PyErr_Occurred())
7130 return NULL;
7131 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007132 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007133 return unicode_getitem(self, i);
7134 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007136 Py_UNICODE* source_buf;
7137 Py_UNICODE* result_buf;
7138 PyObject* result;
7139
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007140 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007141 &start, &stop, &step, &slicelength) < 0) {
7142 return NULL;
7143 }
7144
7145 if (slicelength <= 0) {
7146 return PyUnicode_FromUnicode(NULL, 0);
7147 } else {
7148 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007149 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7150 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007151
7152 if (result_buf == NULL)
7153 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007154
7155 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7156 result_buf[i] = source_buf[cur];
7157 }
Tim Petersced69f82003-09-16 20:30:58 +00007158
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007159 result = PyUnicode_FromUnicode(result_buf, slicelength);
7160 PyMem_FREE(result_buf);
7161 return result;
7162 }
7163 } else {
7164 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7165 return NULL;
7166 }
7167}
7168
7169static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007170 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007171 (binaryfunc)unicode_subscript, /* mp_subscript */
7172 (objobjargproc)0, /* mp_ass_subscript */
7173};
7174
Martin v. Löwis18e16552006-02-15 17:27:45 +00007175static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007177 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 const void **ptr)
7179{
7180 if (index != 0) {
7181 PyErr_SetString(PyExc_SystemError,
7182 "accessing non-existent unicode segment");
7183 return -1;
7184 }
7185 *ptr = (void *) self->str;
7186 return PyUnicode_GET_DATA_SIZE(self);
7187}
7188
Martin v. Löwis18e16552006-02-15 17:27:45 +00007189static Py_ssize_t
7190unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 const void **ptr)
7192{
7193 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007194 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 return -1;
7196}
7197
7198static int
7199unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
7202 if (lenp)
7203 *lenp = PyUnicode_GET_DATA_SIZE(self);
7204 return 1;
7205}
7206
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007207static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 const void **ptr)
7211{
7212 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (index != 0) {
7215 PyErr_SetString(PyExc_SystemError,
7216 "accessing non-existent unicode segment");
7217 return -1;
7218 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007219 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 if (str == NULL)
7221 return -1;
7222 *ptr = (void *) PyString_AS_STRING(str);
7223 return PyString_GET_SIZE(str);
7224}
7225
7226/* Helpers for PyUnicode_Format() */
7227
7228static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007229getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 if (argidx < arglen) {
7233 (*p_argidx)++;
7234 if (arglen < 0)
7235 return args;
7236 else
7237 return PyTuple_GetItem(args, argidx);
7238 }
7239 PyErr_SetString(PyExc_TypeError,
7240 "not enough arguments for format string");
7241 return NULL;
7242}
7243
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7249
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007251strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 register Py_ssize_t i;
7254 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 for (i = len - 1; i >= 0; i--)
7256 buffer[i] = (Py_UNICODE) charbuffer[i];
7257
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 return len;
7259}
7260
Neal Norwitzfc76d632006-01-10 06:03:13 +00007261static int
7262doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7263{
Tim Peters15231542006-02-16 01:08:01 +00007264 Py_ssize_t result;
7265
Neal Norwitzfc76d632006-01-10 06:03:13 +00007266 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007267 result = strtounicode(buffer, (char *)buffer);
7268 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007269}
7270
7271static int
7272longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7273{
Tim Peters15231542006-02-16 01:08:01 +00007274 Py_ssize_t result;
7275
Neal Norwitzfc76d632006-01-10 06:03:13 +00007276 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007277 result = strtounicode(buffer, (char *)buffer);
7278 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007279}
7280
Guido van Rossum078151d2002-08-11 04:24:12 +00007281/* XXX To save some code duplication, formatfloat/long/int could have been
7282 shared with stringobject.c, converting from 8-bit to Unicode after the
7283 formatting is done. */
7284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285static int
7286formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007287 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 int flags,
7289 int prec,
7290 int type,
7291 PyObject *v)
7292{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007293 /* fmt = '%#.' + `prec` + `type`
7294 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 char fmt[20];
7296 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007297
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 x = PyFloat_AsDouble(v);
7299 if (x == -1.0 && PyErr_Occurred())
7300 return -1;
7301 if (prec < 0)
7302 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7304 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007305 /* Worst case length calc to ensure no buffer overrun:
7306
7307 'g' formats:
7308 fmt = %#.<prec>g
7309 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7310 for any double rep.)
7311 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7312
7313 'f' formats:
7314 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7315 len = 1 + 50 + 1 + prec = 52 + prec
7316
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007317 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007318 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007319
7320 */
7321 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7322 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007323 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007324 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007325 return -1;
7326 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007327 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7328 (flags&F_ALT) ? "#" : "",
7329 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007330 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331}
7332
Tim Peters38fd5b62000-09-21 05:43:11 +00007333static PyObject*
7334formatlong(PyObject *val, int flags, int prec, int type)
7335{
7336 char *buf;
7337 int i, len;
7338 PyObject *str; /* temporary string object. */
7339 PyUnicodeObject *result;
7340
7341 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7342 if (!str)
7343 return NULL;
7344 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007345 if (!result) {
7346 Py_DECREF(str);
7347 return NULL;
7348 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007349 for (i = 0; i < len; i++)
7350 result->str[i] = buf[i];
7351 result->str[len] = 0;
7352 Py_DECREF(str);
7353 return (PyObject*)result;
7354}
7355
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356static int
7357formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007358 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 int flags,
7360 int prec,
7361 int type,
7362 PyObject *v)
7363{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007364 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007365 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7366 * + 1 + 1
7367 * = 24
7368 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007369 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007370 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 long x;
7372
7373 x = PyInt_AsLong(v);
7374 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007375 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007376 if (x < 0 && type == 'u') {
7377 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007378 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007379 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7380 sign = "-";
7381 else
7382 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007384 prec = 1;
7385
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007386 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7387 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007388 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007389 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007390 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007391 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007392 return -1;
7393 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007394
7395 if ((flags & F_ALT) &&
7396 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007397 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007398 * of issues that cause pain:
7399 * - when 0 is being converted, the C standard leaves off
7400 * the '0x' or '0X', which is inconsistent with other
7401 * %#x/%#X conversions and inconsistent with Python's
7402 * hex() function
7403 * - there are platforms that violate the standard and
7404 * convert 0 with the '0x' or '0X'
7405 * (Metrowerks, Compaq Tru64)
7406 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007407 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007408 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007409 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007410 * We can achieve the desired consistency by inserting our
7411 * own '0x' or '0X' prefix, and substituting %x/%X in place
7412 * of %#x/%#X.
7413 *
7414 * Note that this is the same approach as used in
7415 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007416 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007417 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7418 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007419 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007420 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007421 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7422 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007423 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007424 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007425 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007426 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007427 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007428 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429}
7430
7431static int
7432formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007433 size_t buflen,
7434 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007436 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007437 if (PyUnicode_Check(v)) {
7438 if (PyUnicode_GET_SIZE(v) != 1)
7439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007443 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007444 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007445 goto onError;
7446 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449 else {
7450 /* Integer input truncated to a character */
7451 long x;
7452 x = PyInt_AsLong(v);
7453 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007454 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007455#ifdef Py_UNICODE_WIDE
7456 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007457 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007458 "%c arg not in range(0x110000) "
7459 "(wide Python build)");
7460 return -1;
7461 }
7462#else
7463 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007464 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007465 "%c arg not in range(0x10000) "
7466 "(narrow Python build)");
7467 return -1;
7468 }
7469#endif
7470 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 }
7472 buf[1] = '\0';
7473 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007474
7475 onError:
7476 PyErr_SetString(PyExc_TypeError,
7477 "%c requires int or char");
7478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the cast cannot bind to
   neighbouring tokens wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
7490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491PyObject *PyUnicode_Format(PyObject *format,
7492 PyObject *args)
7493{
7494 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007495 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 int args_owned = 0;
7497 PyUnicodeObject *result = NULL;
7498 PyObject *dict = NULL;
7499 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007500
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 if (format == NULL || args == NULL) {
7502 PyErr_BadInternalCall();
7503 return NULL;
7504 }
7505 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007506 if (uformat == NULL)
7507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 fmt = PyUnicode_AS_UNICODE(uformat);
7509 fmtcnt = PyUnicode_GET_SIZE(uformat);
7510
7511 reslen = rescnt = fmtcnt + 100;
7512 result = _PyUnicode_New(reslen);
7513 if (result == NULL)
7514 goto onError;
7515 res = PyUnicode_AS_UNICODE(result);
7516
7517 if (PyTuple_Check(args)) {
7518 arglen = PyTuple_Size(args);
7519 argidx = 0;
7520 }
7521 else {
7522 arglen = -1;
7523 argidx = -2;
7524 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007525 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7526 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 dict = args;
7528
7529 while (--fmtcnt >= 0) {
7530 if (*fmt != '%') {
7531 if (--rescnt < 0) {
7532 rescnt = fmtcnt + 100;
7533 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007534 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7537 --rescnt;
7538 }
7539 *res++ = *fmt++;
7540 }
7541 else {
7542 /* Got a format specifier */
7543 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 Py_UNICODE c = '\0';
7547 Py_UNICODE fill;
7548 PyObject *v = NULL;
7549 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007550 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007553 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554
7555 fmt++;
7556 if (*fmt == '(') {
7557 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007558 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 PyObject *key;
7560 int pcount = 1;
7561
7562 if (dict == NULL) {
7563 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007564 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 goto onError;
7566 }
7567 ++fmt;
7568 --fmtcnt;
7569 keystart = fmt;
7570 /* Skip over balanced parentheses */
7571 while (pcount > 0 && --fmtcnt >= 0) {
7572 if (*fmt == ')')
7573 --pcount;
7574 else if (*fmt == '(')
7575 ++pcount;
7576 fmt++;
7577 }
7578 keylen = fmt - keystart - 1;
7579 if (fmtcnt < 0 || pcount > 0) {
7580 PyErr_SetString(PyExc_ValueError,
7581 "incomplete format key");
7582 goto onError;
7583 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007584#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007585 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 then looked up since Python uses strings to hold
7587 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007588 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 key = PyUnicode_EncodeUTF8(keystart,
7590 keylen,
7591 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007592#else
7593 key = PyUnicode_FromUnicode(keystart, keylen);
7594#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 if (key == NULL)
7596 goto onError;
7597 if (args_owned) {
7598 Py_DECREF(args);
7599 args_owned = 0;
7600 }
7601 args = PyObject_GetItem(dict, key);
7602 Py_DECREF(key);
7603 if (args == NULL) {
7604 goto onError;
7605 }
7606 args_owned = 1;
7607 arglen = -1;
7608 argidx = -2;
7609 }
7610 while (--fmtcnt >= 0) {
7611 switch (c = *fmt++) {
7612 case '-': flags |= F_LJUST; continue;
7613 case '+': flags |= F_SIGN; continue;
7614 case ' ': flags |= F_BLANK; continue;
7615 case '#': flags |= F_ALT; continue;
7616 case '0': flags |= F_ZERO; continue;
7617 }
7618 break;
7619 }
7620 if (c == '*') {
7621 v = getnextarg(args, arglen, &argidx);
7622 if (v == NULL)
7623 goto onError;
7624 if (!PyInt_Check(v)) {
7625 PyErr_SetString(PyExc_TypeError,
7626 "* wants int");
7627 goto onError;
7628 }
7629 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007630 if (width == -1 && PyErr_Occurred())
7631 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 if (width < 0) {
7633 flags |= F_LJUST;
7634 width = -width;
7635 }
7636 if (--fmtcnt >= 0)
7637 c = *fmt++;
7638 }
7639 else if (c >= '0' && c <= '9') {
7640 width = c - '0';
7641 while (--fmtcnt >= 0) {
7642 c = *fmt++;
7643 if (c < '0' || c > '9')
7644 break;
7645 if ((width*10) / 10 != width) {
7646 PyErr_SetString(PyExc_ValueError,
7647 "width too big");
7648 goto onError;
7649 }
7650 width = width*10 + (c - '0');
7651 }
7652 }
7653 if (c == '.') {
7654 prec = 0;
7655 if (--fmtcnt >= 0)
7656 c = *fmt++;
7657 if (c == '*') {
7658 v = getnextarg(args, arglen, &argidx);
7659 if (v == NULL)
7660 goto onError;
7661 if (!PyInt_Check(v)) {
7662 PyErr_SetString(PyExc_TypeError,
7663 "* wants int");
7664 goto onError;
7665 }
7666 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007667 if (prec == -1 && PyErr_Occurred())
7668 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 if (prec < 0)
7670 prec = 0;
7671 if (--fmtcnt >= 0)
7672 c = *fmt++;
7673 }
7674 else if (c >= '0' && c <= '9') {
7675 prec = c - '0';
7676 while (--fmtcnt >= 0) {
7677 c = Py_CHARMASK(*fmt++);
7678 if (c < '0' || c > '9')
7679 break;
7680 if ((prec*10) / 10 != prec) {
7681 PyErr_SetString(PyExc_ValueError,
7682 "prec too big");
7683 goto onError;
7684 }
7685 prec = prec*10 + (c - '0');
7686 }
7687 }
7688 } /* prec */
7689 if (fmtcnt >= 0) {
7690 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 if (--fmtcnt >= 0)
7692 c = *fmt++;
7693 }
7694 }
7695 if (fmtcnt < 0) {
7696 PyErr_SetString(PyExc_ValueError,
7697 "incomplete format");
7698 goto onError;
7699 }
7700 if (c != '%') {
7701 v = getnextarg(args, arglen, &argidx);
7702 if (v == NULL)
7703 goto onError;
7704 }
7705 sign = 0;
7706 fill = ' ';
7707 switch (c) {
7708
7709 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007710 pbuf = formatbuf;
7711 /* presume that buffer length is at least 1 */
7712 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 len = 1;
7714 break;
7715
7716 case 's':
7717 case 'r':
7718 if (PyUnicode_Check(v) && c == 's') {
7719 temp = v;
7720 Py_INCREF(temp);
7721 }
7722 else {
7723 PyObject *unicode;
7724 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007725 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 else
7727 temp = PyObject_Repr(v);
7728 if (temp == NULL)
7729 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007730 if (PyUnicode_Check(temp))
7731 /* nothing to do */;
7732 else if (PyString_Check(temp)) {
7733 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007734 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007736 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007738 Py_DECREF(temp);
7739 temp = unicode;
7740 if (temp == NULL)
7741 goto onError;
7742 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007743 else {
7744 Py_DECREF(temp);
7745 PyErr_SetString(PyExc_TypeError,
7746 "%s argument has non-string str()");
7747 goto onError;
7748 }
7749 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007750 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 len = PyUnicode_GET_SIZE(temp);
7752 if (prec >= 0 && len > prec)
7753 len = prec;
7754 break;
7755
7756 case 'i':
7757 case 'd':
7758 case 'u':
7759 case 'o':
7760 case 'x':
7761 case 'X':
7762 if (c == 'i')
7763 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007764 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007765 temp = formatlong(v, flags, prec, c);
7766 if (!temp)
7767 goto onError;
7768 pbuf = PyUnicode_AS_UNICODE(temp);
7769 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007770 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007772 else {
7773 pbuf = formatbuf;
7774 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7775 flags, prec, c, v);
7776 if (len < 0)
7777 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007778 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007779 }
7780 if (flags & F_ZERO)
7781 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 break;
7783
7784 case 'e':
7785 case 'E':
7786 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007787 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 case 'g':
7789 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007790 if (c == 'F')
7791 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007792 pbuf = formatbuf;
7793 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7794 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 if (len < 0)
7796 goto onError;
7797 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007798 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 fill = '0';
7800 break;
7801
7802 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007803 pbuf = formatbuf;
7804 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 if (len < 0)
7806 goto onError;
7807 break;
7808
7809 default:
7810 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007811 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007812 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007813 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007814 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007815 (Py_ssize_t)(fmt - 1 -
7816 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 goto onError;
7818 }
7819 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007820 if (*pbuf == '-' || *pbuf == '+') {
7821 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 len--;
7823 }
7824 else if (flags & F_SIGN)
7825 sign = '+';
7826 else if (flags & F_BLANK)
7827 sign = ' ';
7828 else
7829 sign = 0;
7830 }
7831 if (width < len)
7832 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007833 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 reslen -= rescnt;
7835 rescnt = width + fmtcnt + 100;
7836 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007837 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007838 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007839 PyErr_NoMemory();
7840 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007841 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007842 if (_PyUnicode_Resize(&result, reslen) < 0) {
7843 Py_XDECREF(temp);
7844 goto onError;
7845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 res = PyUnicode_AS_UNICODE(result)
7847 + reslen - rescnt;
7848 }
7849 if (sign) {
7850 if (fill != ' ')
7851 *res++ = sign;
7852 rescnt--;
7853 if (width > len)
7854 width--;
7855 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007856 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7857 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007858 assert(pbuf[1] == c);
7859 if (fill != ' ') {
7860 *res++ = *pbuf++;
7861 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007862 }
Tim Petersfff53252001-04-12 18:38:48 +00007863 rescnt -= 2;
7864 width -= 2;
7865 if (width < 0)
7866 width = 0;
7867 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 if (width > len && !(flags & F_LJUST)) {
7870 do {
7871 --rescnt;
7872 *res++ = fill;
7873 } while (--width > len);
7874 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007875 if (fill == ' ') {
7876 if (sign)
7877 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007878 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007879 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007880 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007881 *res++ = *pbuf++;
7882 *res++ = *pbuf++;
7883 }
7884 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007885 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 res += len;
7887 rescnt -= len;
7888 while (--width >= len) {
7889 --rescnt;
7890 *res++ = ' ';
7891 }
7892 if (dict && (argidx < arglen) && c != '%') {
7893 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007894 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007895 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 goto onError;
7897 }
7898 Py_XDECREF(temp);
7899 } /* '%' */
7900 } /* until end */
7901 if (argidx < arglen && !dict) {
7902 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007903 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 goto onError;
7905 }
7906
Thomas Woutersa96affe2006-03-12 00:29:36 +00007907 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7908 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 if (args_owned) {
7910 Py_DECREF(args);
7911 }
7912 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 return (PyObject *)result;
7914
7915 onError:
7916 Py_XDECREF(result);
7917 Py_DECREF(uformat);
7918 if (args_owned) {
7919 Py_DECREF(args);
7920 }
7921 return NULL;
7922}
7923
7924static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007925 (readbufferproc) unicode_buffer_getreadbuf,
7926 (writebufferproc) unicode_buffer_getwritebuf,
7927 (segcountproc) unicode_buffer_getsegcount,
7928 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929};
7930
Jeremy Hylton938ace62002-07-17 16:30:39 +00007931static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007932unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7933
Tim Peters6d6c1a32001-08-02 04:15:00 +00007934static PyObject *
7935unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7936{
7937 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007938 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007939 char *encoding = NULL;
7940 char *errors = NULL;
7941
Guido van Rossume023fe02001-08-30 03:12:59 +00007942 if (type != &PyUnicode_Type)
7943 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007944 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7945 kwlist, &x, &encoding, &errors))
7946 return NULL;
7947 if (x == NULL)
7948 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007949 if (encoding == NULL && errors == NULL)
7950 return PyObject_Unicode(x);
7951 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007952 return PyUnicode_FromEncodedObject(x, encoding, errors);
7953}
7954
Guido van Rossume023fe02001-08-30 03:12:59 +00007955static PyObject *
7956unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7957{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007958 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007960
7961 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7962 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7963 if (tmp == NULL)
7964 return NULL;
7965 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007966 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007967 if (pnew == NULL) {
7968 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007969 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007970 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007971 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7972 if (pnew->str == NULL) {
7973 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007974 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007975 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007976 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007977 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007978 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7979 pnew->length = n;
7980 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007981 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007982 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007983}
7984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007985PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007986"unicode(string [, encoding[, errors]]) -> object\n\
7987\n\
7988Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007989encoding defaults to the current default string encoding.\n\
7990errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007991
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007992static PyObject *unicode_iter(PyObject *seq);
7993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994PyTypeObject PyUnicode_Type = {
7995 PyObject_HEAD_INIT(&PyType_Type)
7996 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00007997 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 sizeof(PyUnicodeObject), /* tp_size */
7999 0, /* tp_itemsize */
8000 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008001 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008003 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008005 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008006 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008007 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008009 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 (hashfunc) unicode_hash, /* tp_hash*/
8011 0, /* tp_call*/
8012 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008013 PyObject_GenericGetAttr, /* tp_getattro */
8014 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008016 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8017 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008018 unicode_doc, /* tp_doc */
8019 0, /* tp_traverse */
8020 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008021 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008022 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008023 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008024 0, /* tp_iternext */
8025 unicode_methods, /* tp_methods */
8026 0, /* tp_members */
8027 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008028 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008029 0, /* tp_dict */
8030 0, /* tp_descr_get */
8031 0, /* tp_descr_set */
8032 0, /* tp_dictoffset */
8033 0, /* tp_init */
8034 0, /* tp_alloc */
8035 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008036 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037};
8038
8039/* Initialize the Unicode implementation */
8040
Thomas Wouters78890102000-07-22 19:25:51 +00008041void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008043 int i;
8044
Thomas Wouters477c8d52006-05-27 19:21:47 +00008045 /* XXX - move this array to unicodectype.c ? */
8046 Py_UNICODE linebreak[] = {
8047 0x000A, /* LINE FEED */
8048 0x000D, /* CARRIAGE RETURN */
8049 0x001C, /* FILE SEPARATOR */
8050 0x001D, /* GROUP SEPARATOR */
8051 0x001E, /* RECORD SEPARATOR */
8052 0x0085, /* NEXT LINE */
8053 0x2028, /* LINE SEPARATOR */
8054 0x2029, /* PARAGRAPH SEPARATOR */
8055 };
8056
Fred Drakee4315f52000-05-09 19:53:39 +00008057 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008058 unicode_freelist = NULL;
8059 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008061 if (!unicode_empty)
8062 return;
8063
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008064 for (i = 0; i < 256; i++)
8065 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008066 if (PyType_Ready(&PyUnicode_Type) < 0)
8067 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008068
8069 /* initialize the linebreak bloom filter */
8070 bloom_linebreak = make_bloom_mask(
8071 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8072 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008073
8074 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075}
8076
8077/* Finalize the Unicode implementation */
8078
8079void
Thomas Wouters78890102000-07-22 19:25:51 +00008080_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008082 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008083 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008085 Py_XDECREF(unicode_empty);
8086 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008087
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008088 for (i = 0; i < 256; i++) {
8089 if (unicode_latin1[i]) {
8090 Py_DECREF(unicode_latin1[i]);
8091 unicode_latin1[i] = NULL;
8092 }
8093 }
8094
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008095 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 PyUnicodeObject *v = u;
8097 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008098 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008099 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008100 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008101 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008103 unicode_freelist = NULL;
8104 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008106
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008107
8108
8109/********************* Unicode Iterator **************************/
8110
8111typedef struct {
8112 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008113 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008114 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8115} unicodeiterobject;
8116
8117static void
8118unicodeiter_dealloc(unicodeiterobject *it)
8119{
8120 _PyObject_GC_UNTRACK(it);
8121 Py_XDECREF(it->it_seq);
8122 PyObject_GC_Del(it);
8123}
8124
8125static int
8126unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8127{
8128 Py_VISIT(it->it_seq);
8129 return 0;
8130}
8131
8132static PyObject *
8133unicodeiter_next(unicodeiterobject *it)
8134{
8135 PyUnicodeObject *seq;
8136 PyObject *item;
8137
8138 assert(it != NULL);
8139 seq = it->it_seq;
8140 if (seq == NULL)
8141 return NULL;
8142 assert(PyUnicode_Check(seq));
8143
8144 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008145 item = PyUnicode_FromUnicode(
8146 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008147 if (item != NULL)
8148 ++it->it_index;
8149 return item;
8150 }
8151
8152 Py_DECREF(seq);
8153 it->it_seq = NULL;
8154 return NULL;
8155}
8156
8157static PyObject *
8158unicodeiter_len(unicodeiterobject *it)
8159{
8160 Py_ssize_t len = 0;
8161 if (it->it_seq)
8162 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8163 return PyInt_FromSsize_t(len);
8164}
8165
8166PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8167
8168static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008169 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8170 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008171 {NULL, NULL} /* sentinel */
8172};
8173
8174PyTypeObject PyUnicodeIter_Type = {
8175 PyObject_HEAD_INIT(&PyType_Type)
8176 0, /* ob_size */
8177 "unicodeiterator", /* tp_name */
8178 sizeof(unicodeiterobject), /* tp_basicsize */
8179 0, /* tp_itemsize */
8180 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008181 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008182 0, /* tp_print */
8183 0, /* tp_getattr */
8184 0, /* tp_setattr */
8185 0, /* tp_compare */
8186 0, /* tp_repr */
8187 0, /* tp_as_number */
8188 0, /* tp_as_sequence */
8189 0, /* tp_as_mapping */
8190 0, /* tp_hash */
8191 0, /* tp_call */
8192 0, /* tp_str */
8193 PyObject_GenericGetAttr, /* tp_getattro */
8194 0, /* tp_setattro */
8195 0, /* tp_as_buffer */
8196 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8197 0, /* tp_doc */
8198 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8199 0, /* tp_clear */
8200 0, /* tp_richcompare */
8201 0, /* tp_weaklistoffset */
8202 PyObject_SelfIter, /* tp_iter */
8203 (iternextfunc)unicodeiter_next, /* tp_iternext */
8204 unicodeiter_methods, /* tp_methods */
8205 0,
8206};
8207
8208static PyObject *
8209unicode_iter(PyObject *seq)
8210{
8211 unicodeiterobject *it;
8212
8213 if (!PyUnicode_Check(seq)) {
8214 PyErr_BadInternalCall();
8215 return NULL;
8216 }
8217 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8218 if (it == NULL)
8219 return NULL;
8220 it->it_index = 0;
8221 Py_INCREF(seq);
8222 it->it_seq = (PyUnicodeObject *)seq;
8223 _PyObject_GC_TRACK(it);
8224 return (PyObject *)it;
8225}
8226
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008227#ifdef __cplusplus
8228}
8229#endif
8230
8231
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008232/*
8233Local variables:
8234c-basic-offset: 4
8235indent-tabs-mode: nil
8236End:
8237*/