blob: d4a17ce43bb44e4f4dc19ff60dcb7d94dcaa69dc [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of released Unicode objects that are kept
   on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for the character buffers of free-list objects.

   When an object is put on the free list and its length is below this
   limit, its character buffer stays allocated so that the next
   allocation can reuse it.  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used; the 5 least
   significant bits of each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */
130
/* Integer type that holds the filter bits (bit indices 0..31 are used). */
#define BLOOM_MASK unsigned long

/* Filter for line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the 5 low bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 by 31
   overflows a signed int, and the sign-extended result would also match
   unrelated high bits of a 64-bit mask. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-test with the bloom mask, then the exact line break check. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* True if chr is a member of set[0..setlen-1].  The bloom mask is tested
   first so that most non-members are rejected without scanning the set.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
351#define _PyUnicode_Resize(unicodevar, length) \
352 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
396#ifdef HAVE_WCHAR_H
397
398PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000399 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400{
401 PyUnicodeObject *unicode;
402
403 if (w == NULL) {
404 PyErr_BadInternalCall();
405 return NULL;
406 }
407
408 unicode = _PyUnicode_New(size);
409 if (!unicode)
410 return NULL;
411
412 /* Copy the wchar_t data into the new object */
413#ifdef HAVE_USABLE_WCHAR_T
414 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000415#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 {
417 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000418 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000420 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421 *u++ = *w++;
422 }
423#endif
424
425 return (PyObject *)unicode;
426}
427
Martin v. Löwis18e16552006-02-15 17:27:45 +0000428Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
429 wchar_t *w,
430 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431{
432 if (unicode == NULL) {
433 PyErr_BadInternalCall();
434 return -1;
435 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000436
437 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000438 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000439 size = PyUnicode_GET_SIZE(unicode) + 1;
440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441#ifdef HAVE_USABLE_WCHAR_T
442 memcpy(w, unicode->str, size * sizeof(wchar_t));
443#else
444 {
445 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000446 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000448 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449 *w++ = *u++;
450 }
451#endif
452
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000453 if (size > PyUnicode_GET_SIZE(unicode))
454 return PyUnicode_GET_SIZE(unicode);
455 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 return size;
457}
458
459#endif
460
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000461PyObject *PyUnicode_FromOrdinal(int ordinal)
462{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000463 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000464
465#ifdef Py_UNICODE_WIDE
466 if (ordinal < 0 || ordinal > 0x10ffff) {
467 PyErr_SetString(PyExc_ValueError,
468 "unichr() arg not in range(0x110000) "
469 "(wide Python build)");
470 return NULL;
471 }
472#else
473 if (ordinal < 0 || ordinal > 0xffff) {
474 PyErr_SetString(PyExc_ValueError,
475 "unichr() arg not in range(0x10000) "
476 "(narrow Python build)");
477 return NULL;
478 }
479#endif
480
Hye-Shik Chang40574832004-04-06 07:24:51 +0000481 s[0] = (Py_UNICODE)ordinal;
482 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromObject(register PyObject *obj)
486{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487 /* XXX Perhaps we should make this API an alias of
488 PyObject_Unicode() instead ?! */
489 if (PyUnicode_CheckExact(obj)) {
490 Py_INCREF(obj);
491 return obj;
492 }
493 if (PyUnicode_Check(obj)) {
494 /* For a Unicode subtype that's not a Unicode object,
495 return a true Unicode object with the same data. */
496 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
497 PyUnicode_GET_SIZE(obj));
498 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000499 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
500}
501
502PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
503 const char *encoding,
504 const char *errors)
505{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000507 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000509
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (obj == NULL) {
511 PyErr_BadInternalCall();
512 return NULL;
513 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000515#if 0
516 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000517 that no encodings is given and then redirect to
518 PyObject_Unicode() which then applies the additional logic for
519 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000520
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000521 NOTE: This API should really only be used for object which
522 represent *encoded* Unicode !
523
524 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000525 if (PyUnicode_Check(obj)) {
526 if (encoding) {
527 PyErr_SetString(PyExc_TypeError,
528 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000529 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000530 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000532 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533#else
534 if (PyUnicode_Check(obj)) {
535 PyErr_SetString(PyExc_TypeError,
536 "decoding Unicode is not supported");
537 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000538 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539#endif
540
541 /* Coerce object */
542 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000543 s = PyString_AS_STRING(obj);
544 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000545 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000546 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
547 /* Overwrite the error message with something more useful in
548 case of a TypeError. */
549 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000550 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000551 "coercing to Unicode: need string or buffer, "
552 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000553 obj->ob_type->tp_name);
554 goto onError;
555 }
Tim Petersced69f82003-09-16 20:30:58 +0000556
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000557 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 if (len == 0) {
559 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000560 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 }
Tim Petersced69f82003-09-16 20:30:58 +0000562 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000563 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000564
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000565 return v;
566
567 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569}
570
571PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000572 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 const char *encoding,
574 const char *errors)
575{
576 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000577
578 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Shortcuts for common default encodings */
582 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000583 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000584 else if (strcmp(encoding, "latin-1") == 0)
585 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000586#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
587 else if (strcmp(encoding, "mbcs") == 0)
588 return PyUnicode_DecodeMBCS(s, size, errors);
589#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000590 else if (strcmp(encoding, "ascii") == 0)
591 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592
593 /* Decode via the codec registry */
594 buffer = PyBuffer_FromMemory((void *)s, size);
595 if (buffer == NULL)
596 goto onError;
597 unicode = PyCodec_Decode(buffer, encoding, errors);
598 if (unicode == NULL)
599 goto onError;
600 if (!PyUnicode_Check(unicode)) {
601 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000602 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603 unicode->ob_type->tp_name);
604 Py_DECREF(unicode);
605 goto onError;
606 }
607 Py_DECREF(buffer);
608 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000609
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 onError:
611 Py_XDECREF(buffer);
612 return NULL;
613}
614
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000615PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
616 const char *encoding,
617 const char *errors)
618{
619 PyObject *v;
620
621 if (!PyUnicode_Check(unicode)) {
622 PyErr_BadArgument();
623 goto onError;
624 }
625
626 if (encoding == NULL)
627 encoding = PyUnicode_GetDefaultEncoding();
628
629 /* Decode via the codec registry */
630 v = PyCodec_Decode(unicode, encoding, errors);
631 if (v == NULL)
632 goto onError;
633 return v;
634
635 onError:
636 return NULL;
637}
638
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000640 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 const char *encoding,
642 const char *errors)
643{
644 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 unicode = PyUnicode_FromUnicode(s, size);
647 if (unicode == NULL)
648 return NULL;
649 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
650 Py_DECREF(unicode);
651 return v;
652}
653
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000654PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
655 const char *encoding,
656 const char *errors)
657{
658 PyObject *v;
659
660 if (!PyUnicode_Check(unicode)) {
661 PyErr_BadArgument();
662 goto onError;
663 }
664
665 if (encoding == NULL)
666 encoding = PyUnicode_GetDefaultEncoding();
667
668 /* Encode via the codec registry */
669 v = PyCodec_Encode(unicode, encoding, errors);
670 if (v == NULL)
671 goto onError;
672 return v;
673
674 onError:
675 return NULL;
676}
677
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
679 const char *encoding,
680 const char *errors)
681{
682 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000683
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 if (!PyUnicode_Check(unicode)) {
685 PyErr_BadArgument();
686 goto onError;
687 }
Fred Drakee4315f52000-05-09 19:53:39 +0000688
Tim Petersced69f82003-09-16 20:30:58 +0000689 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000690 encoding = PyUnicode_GetDefaultEncoding();
691
692 /* Shortcuts for common default encodings */
693 if (errors == NULL) {
694 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000695 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000696 else if (strcmp(encoding, "latin-1") == 0)
697 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
699 else if (strcmp(encoding, "mbcs") == 0)
700 return PyUnicode_AsMBCSString(unicode);
701#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000702 else if (strcmp(encoding, "ascii") == 0)
703 return PyUnicode_AsASCIIString(unicode);
704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705
706 /* Encode via the codec registry */
707 v = PyCodec_Encode(unicode, encoding, errors);
708 if (v == NULL)
709 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000710 if (!PyBytes_Check(v)) {
711 if (PyString_Check(v)) {
712 /* Old codec, turn it into bytes */
713 PyObject *b = PyBytes_FromObject(v);
714 Py_DECREF(v);
715 return b;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000718 "encoder did not return a bytes object "
719 "(type=%.400s, encoding=%.20s, errors=%.20s)",
720 v->ob_type->tp_name,
721 encoding ? encoding : "NULL",
722 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000736 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000737 if (v)
738 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000739 if (errors != NULL)
740 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
741 if (errors == NULL) {
742 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
743 PyUnicode_GET_SIZE(unicode),
744 NULL);
745 }
746 else {
747 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
748 }
749 if (!b)
750 return NULL;
751 v = PyString_FromStringAndSize(PyBytes_AsString(b),
752 PyBytes_Size(b));
753 Py_DECREF(b);
754 if (!errors) {
755 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000756 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000757 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000758 return v;
759}
760
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
762{
763 if (!PyUnicode_Check(unicode)) {
764 PyErr_BadArgument();
765 goto onError;
766 }
767 return PyUnicode_AS_UNICODE(unicode);
768
769 onError:
770 return NULL;
771}
772
Martin v. Löwis18e16552006-02-15 17:27:45 +0000773Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774{
775 if (!PyUnicode_Check(unicode)) {
776 PyErr_BadArgument();
777 goto onError;
778 }
779 return PyUnicode_GET_SIZE(unicode);
780
781 onError:
782 return -1;
783}
784
Thomas Wouters78890102000-07-22 19:25:51 +0000785const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000786{
787 return unicode_default_encoding;
788}
789
790int PyUnicode_SetDefaultEncoding(const char *encoding)
791{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000792 if (strcmp(encoding, unicode_default_encoding) != 0) {
793 PyErr_Format(PyExc_ValueError,
794 "Can only set default encoding to %s",
795 unicode_default_encoding);
796 return -1;
797 }
Fred Drakee4315f52000-05-09 19:53:39 +0000798 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000799}
800
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801/* error handling callback helper:
802 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000803 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000804 and adjust various state variables.
805 return 0 on success, -1 on error
806*/
807
808static
809int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
810 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000811 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
812 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815
816 PyObject *restuple = NULL;
817 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
819 Py_ssize_t requiredsize;
820 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000821 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000822 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000823 int res = -1;
824
825 if (*errorHandler == NULL) {
826 *errorHandler = PyCodec_LookupError(errors);
827 if (*errorHandler == NULL)
828 goto onError;
829 }
830
831 if (*exceptionObject == NULL) {
832 *exceptionObject = PyUnicodeDecodeError_Create(
833 encoding, input, insize, *startinpos, *endinpos, reason);
834 if (*exceptionObject == NULL)
835 goto onError;
836 }
837 else {
838 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
839 goto onError;
840 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
841 goto onError;
842 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
843 goto onError;
844 }
845
846 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
847 if (restuple == NULL)
848 goto onError;
849 if (!PyTuple_Check(restuple)) {
850 PyErr_Format(PyExc_TypeError, &argparse[4]);
851 goto onError;
852 }
853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
854 goto onError;
855 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000856 newpos = insize+newpos;
857 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000858 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000859 goto onError;
860 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861
862 /* need more space? (at least enough for what we
863 have+the replacement+the rest of the string (starting
864 at the new input position), so we won't have to check space
865 when there are no errors in the rest of the string) */
866 repptr = PyUnicode_AS_UNICODE(repunicode);
867 repsize = PyUnicode_GET_SIZE(repunicode);
868 requiredsize = *outpos + repsize + insize-newpos;
869 if (requiredsize > outsize) {
870 if (requiredsize<2*outsize)
871 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000872 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000873 goto onError;
874 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
875 }
876 *endinpos = newpos;
877 *inptr = input + newpos;
878 Py_UNICODE_COPY(*outptr, repptr, repsize);
879 *outptr += repsize;
880 *outpos += repsize;
881 /* we made it! */
882 res = 0;
883
884 onError:
885 Py_XDECREF(restuple);
886 return res;
887}
888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889/* --- UTF-7 Codec -------------------------------------------------------- */
890
891/* see RFC2152 for details */
892
/* Classification of the 128 ASCII code points for UTF-7: whether a
   character is special, i.e. cannot be directly encoded.
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
911
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when c is outside 1..127, is marked 1
   in utf7_special, or is marked 2 / 3 while the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.  Explicit ASCII
   range checks are used instead of isalnum(): c may be a code unit outside
   the unsigned char range (undefined behaviour for <ctype.h> functions),
   and isalnum() is locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c (inverse of B64); only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 characters to out for every complete
   group of six bits held in ch, decrementing bits accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): emit one 16-bit unit to out for every
   complete group of 16 bits held in ch.  Expects errmsg and the label
   utf7Error to exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate */ \
            /* so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent */ \
            /* it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 const char *errors)
958{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000959 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t startinpos;
961 Py_ssize_t endinpos;
962 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000963 const char *e;
964 PyUnicodeObject *unicode;
965 Py_UNICODE *p;
966 const char *errmsg = "";
967 int inShift = 0;
968 unsigned int bitsleft = 0;
969 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000970 int surrogate = 0;
971 PyObject *errorHandler = NULL;
972 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000973
974 unicode = _PyUnicode_New(size);
975 if (!unicode)
976 return NULL;
977 if (size == 0)
978 return (PyObject *)unicode;
979
980 p = unicode->str;
981 e = s + size;
982
983 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 Py_UNICODE ch;
985 restart:
986 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987
988 if (inShift) {
989 if ((ch == '-') || !B64CHAR(ch)) {
990 inShift = 0;
991 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000992
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
994 if (bitsleft >= 6) {
995 /* The shift sequence has a partial character in it. If
996 bitsleft < 6 then we could just classify it as padding
997 but that is not the case here */
998
999 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001000 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 }
1002 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001003 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001004 here so indicate the potential of a misencoded character. */
1005
1006 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1007 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1008 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001009 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 }
1011
1012 if (ch == '-') {
1013 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001014 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 inShift = 1;
1016 }
1017 } else if (SPECIAL(ch,0,0)) {
1018 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001019 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 } else {
1021 *p++ = ch;
1022 }
1023 } else {
1024 charsleft = (charsleft << 6) | UB64(ch);
1025 bitsleft += 6;
1026 s++;
1027 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1028 }
1029 }
1030 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 s++;
1033 if (s < e && *s == '-') {
1034 s++;
1035 *p++ = '+';
1036 } else
1037 {
1038 inShift = 1;
1039 bitsleft = 0;
1040 }
1041 }
1042 else if (SPECIAL(ch,0,0)) {
1043 errmsg = "unexpected special character";
1044 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001045 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
1047 else {
1048 *p++ = ch;
1049 s++;
1050 }
1051 continue;
1052 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = s-starts;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", errmsg,
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
1060 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
1063 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 outpos = p-PyUnicode_AS_UNICODE(unicode);
1065 endinpos = size;
1066 if (unicode_decode_call_errorhandler(
1067 errors, &errorHandler,
1068 "utf7", "unterminated shift sequence",
1069 starts, size, &startinpos, &endinpos, &exc, &s,
1070 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001072 if (s < e)
1073 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 }
1075
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001076 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 goto onError;
1078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079 Py_XDECREF(errorHandler);
1080 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 return (PyObject *)unicode;
1082
1083onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001084 Py_XDECREF(errorHandler);
1085 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 Py_DECREF(unicode);
1087 return NULL;
1088}
1089
1090
1091PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 int encodeSetO,
1094 int encodeWhiteSpace,
1095 const char *errors)
1096{
1097 PyObject *v;
1098 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001099 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001100 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001101 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001102 unsigned int bitsleft = 0;
1103 unsigned long charsleft = 0;
1104 char * out;
1105 char * start;
1106
1107 if (size == 0)
1108 return PyString_FromStringAndSize(NULL, 0);
1109
1110 v = PyString_FromStringAndSize(NULL, cbAllocated);
1111 if (v == NULL)
1112 return NULL;
1113
1114 start = out = PyString_AS_STRING(v);
1115 for (;i < size; ++i) {
1116 Py_UNICODE ch = s[i];
1117
1118 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 if (ch == '+') {
1120 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 *out++ = '-';
1122 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1123 charsleft = ch;
1124 bitsleft = 16;
1125 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001126 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001127 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001128 } else {
1129 *out++ = (char) ch;
1130 }
1131 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001132 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1133 *out++ = B64(charsleft << (6-bitsleft));
1134 charsleft = 0;
1135 bitsleft = 0;
1136 /* Characters not in the BASE64 set implicitly unshift the sequence
1137 so no '-' is required, except if the character is itself a '-' */
1138 if (B64CHAR(ch) || ch == '-') {
1139 *out++ = '-';
1140 }
1141 inShift = 0;
1142 *out++ = (char) ch;
1143 } else {
1144 bitsleft += 16;
1145 charsleft = (charsleft << 16) | ch;
1146 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1147
1148 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001149 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 or '-' then the shift sequence will be terminated implicitly and we
1151 don't have to insert a '-'. */
1152
1153 if (bitsleft == 0) {
1154 if (i + 1 < size) {
1155 Py_UNICODE ch2 = s[i+1];
1156
1157 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 } else if (B64CHAR(ch2) || ch2 == '-') {
1160 *out++ = '-';
1161 inShift = 0;
1162 } else {
1163 inShift = 0;
1164 }
1165
1166 }
1167 else {
1168 *out++ = '-';
1169 inShift = 0;
1170 }
1171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 if (bitsleft) {
1176 *out++= B64(charsleft << (6-bitsleft) );
1177 *out++ = '-';
1178 }
1179
Tim Peters5de98422002-04-27 18:44:32 +00001180 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001181 return v;
1182}
1183
1184#undef SPECIAL
1185#undef B64
1186#undef B64CHAR
1187#undef UB64
1188#undef ENCODE
1189#undef DECODE
1190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191/* --- UTF-8 Codec -------------------------------------------------------- */
1192
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 const char *errors)
1218{
Walter Dörwald69652032004-09-07 20:24:22 +00001219 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1220}
1221
1222PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001223 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001224 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001225 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t startinpos;
1230 Py_ssize_t endinpos;
1231 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *e;
1233 PyUnicodeObject *unicode;
1234 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 PyObject *errorHandler = NULL;
1237 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Note: size will always be longer than the resulting Unicode
1240 character count */
1241 unicode = _PyUnicode_New(size);
1242 if (!unicode)
1243 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001244 if (size == 0) {
1245 if (consumed)
1246 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Unpack UTF-8 encoded data */
1251 p = unicode->str;
1252 e = s + size;
1253
1254 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001255 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256
1257 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 s++;
1260 continue;
1261 }
1262
1263 n = utf8_code_length[ch];
1264
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001266 if (consumed)
1267 break;
1268 else {
1269 errmsg = "unexpected end of data";
1270 startinpos = s-starts;
1271 endinpos = size;
1272 goto utf8Error;
1273 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 switch (n) {
1277
1278 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001291 if ((s[1] & 0xc0) != 0x80) {
1292 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 startinpos = s-starts;
1294 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 goto utf8Error;
1296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001298 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 errmsg = "illegal encoding";
1302 goto utf8Error;
1303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001309 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 (s[2] & 0xc0) != 0x80) {
1311 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001312 startinpos = s-starts;
1313 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 goto utf8Error;
1315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001317 if (ch < 0x0800) {
1318 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001319 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320
1321 XXX For wide builds (UCS-4) we should probably try
1322 to recombine the surrogates into a single code
1323 unit.
1324 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001331 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 break;
1333
1334 case 4:
1335 if ((s[1] & 0xc0) != 0x80 ||
1336 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001337 (s[3] & 0xc0) != 0x80) {
1338 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001343 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1344 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1345 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001347 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001349 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001350 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001351 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 startinpos = s-starts;
1353 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001354 goto utf8Error;
1355 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001356#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001357 *p++ = (Py_UNICODE)ch;
1358#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001360
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001361 /* translate from 10000..10FFFF to 0..FFFF */
1362 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001363
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001364 /* high surrogate = top 10 bits added to D800 */
1365 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001367 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001368 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 break;
1371
1372 default:
1373 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 startinpos = s-starts;
1376 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001377 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 }
1379 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 outpos = p-PyUnicode_AS_UNICODE(unicode);
1384 if (unicode_decode_call_errorhandler(
1385 errors, &errorHandler,
1386 "utf8", errmsg,
1387 starts, size, &startinpos, &endinpos, &exc, &s,
1388 (PyObject **)&unicode, &outpos, &p))
1389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Walter Dörwald69652032004-09-07 20:24:22 +00001391 if (consumed)
1392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393
1394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 goto onError;
1397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 Py_XDECREF(errorHandler);
1399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 return (PyObject *)unicode;
1401
1402onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 Py_XDECREF(errorHandler);
1404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 Py_DECREF(unicode);
1406 return NULL;
1407}
1408
Tim Peters602f7402002-04-27 18:03:26 +00001409/* Allocation strategy: if the string is short, convert into a stack buffer
1410 and allocate exactly as much space needed at the end. Else allocate the
1411 maximum possible needed (4 result bytes per Unicode character), and return
1412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001414PyObject *
1415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418{
Tim Peters602f7402002-04-27 18:03:26 +00001419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001420
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001422 PyObject *v; /* result string object */
1423 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001424 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 assert(s != NULL);
1429 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430
Tim Peters602f7402002-04-27 18:03:26 +00001431 if (size <= MAX_SHORT_UNICHARS) {
1432 /* Write into the stack buffer; nallocated can't overflow.
1433 * At the end, we'll allocate exactly as much heap space as it
1434 * turns out we need.
1435 */
1436 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1437 v = NULL; /* will allocate after we're done */
1438 p = stackbuf;
1439 }
1440 else {
1441 /* Overallocate on the heap, and give the excess back at the end. */
1442 nallocated = size * 4;
1443 if (nallocated / 4 != size) /* overflow! */
1444 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001445 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001446 if (v == NULL)
1447 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001448 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001449 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001450
Tim Peters602f7402002-04-27 18:03:26 +00001451 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001452 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001453
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001454 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001457
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001459 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001460 *p++ = (char)(0xc0 | (ch >> 6));
1461 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001463 else {
Tim Peters602f7402002-04-27 18:03:26 +00001464 /* Encode UCS2 Unicode ordinals */
1465 if (ch < 0x10000) {
1466 /* Special case: check for high surrogate */
1467 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1468 Py_UCS4 ch2 = s[i];
1469 /* Check for low surrogate and combine the two to
1470 form a UCS4 value */
1471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001472 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001473 i++;
1474 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001475 }
Tim Peters602f7402002-04-27 18:03:26 +00001476 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001478 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 continue;
1482 }
1483encodeUCS4:
1484 /* Encode UCS4 Unicode ordinals */
1485 *p++ = (char)(0xf0 | (ch >> 18));
1486 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1487 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1488 *p++ = (char)(0x80 | (ch & 0x3f));
1489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001491
Tim Peters602f7402002-04-27 18:03:26 +00001492 if (v == NULL) {
1493 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001494 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001495 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001496 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001497 }
1498 else {
1499 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001500 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001501 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001502 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505
Tim Peters602f7402002-04-27 18:03:26 +00001506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 if (!PyUnicode_Check(unicode)) {
1512 PyErr_BadArgument();
1513 return NULL;
1514 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518}
1519
1520/* --- UTF-16 Codec ------------------------------------------------------- */
1521
Tim Peters772747b2001-08-09 22:21:55 +00001522PyObject *
1523PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001525 const char *errors,
1526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527{
Walter Dörwald69652032004-09-07 20:24:22 +00001528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1529}
1530
1531PyObject *
1532PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001533 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001534 const char *errors,
1535 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t startinpos;
1540 Py_ssize_t endinpos;
1541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 PyUnicodeObject *unicode;
1543 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001544 const unsigned char *q, *e;
1545 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001546 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001547 /* Offsets from q for retrieving byte pairs in the right order. */
1548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1549 int ihi = 1, ilo = 0;
1550#else
1551 int ihi = 0, ilo = 1;
1552#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 PyObject *errorHandler = NULL;
1554 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 /* Note: size will always be longer than the resulting Unicode
1557 character count */
1558 unicode = _PyUnicode_New(size);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-16 encoded data */
1565 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001566 q = (unsigned char *)s;
1567 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568
1569 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001570 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001577 if (size >= 2) {
1578 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = -1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = 1;
1587 }
Tim Petersced69f82003-09-16 20:30:58 +00001588#else
Walter Dörwald69652032004-09-07 20:24:22 +00001589 if (bom == 0xFEFF) {
1590 q += 2;
1591 bo = 1;
1592 }
1593 else if (bom == 0xFFFE) {
1594 q += 2;
1595 bo = -1;
1596 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001597#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001598 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Tim Peters772747b2001-08-09 22:21:55 +00001601 if (bo == -1) {
1602 /* force LE */
1603 ihi = 1;
1604 ilo = 0;
1605 }
1606 else if (bo == 1) {
1607 /* force BE */
1608 ihi = 0;
1609 ilo = 1;
1610 }
1611
1612 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001614 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001616 if (consumed)
1617 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 errmsg = "truncated data";
1619 startinpos = ((const char *)q)-starts;
1620 endinpos = ((const char *)e)-starts;
1621 goto utf16Error;
1622 /* The remaining input chars are ignored if the callback
1623 chooses to skip the input */
1624 }
1625 ch = (q[ihi] << 8) | q[ilo];
1626
Tim Peters772747b2001-08-09 22:21:55 +00001627 q += 2;
1628
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 if (ch < 0xD800 || ch > 0xDFFF) {
1630 *p++ = ch;
1631 continue;
1632 }
1633
1634 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 if (q >= e) {
1636 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 startinpos = (((const char *)q)-2)-starts;
1638 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001639 goto utf16Error;
1640 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001641 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1643 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001644 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001645#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001646 *p++ = ch;
1647 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648#else
1649 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001650#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001651 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652 }
1653 else {
1654 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 startinpos = (((const char *)q)-4)-starts;
1656 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001657 goto utf16Error;
1658 }
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001661 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 startinpos = (((const char *)q)-2)-starts;
1663 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 /* Fall through to report the error */
1665
1666 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 outpos = p-PyUnicode_AS_UNICODE(unicode);
1668 if (unicode_decode_call_errorhandler(
1669 errors, &errorHandler,
1670 "utf16", errmsg,
1671 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1672 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 }
1675
1676 if (byteorder)
1677 *byteorder = bo;
1678
Walter Dörwald69652032004-09-07 20:24:22 +00001679 if (consumed)
1680 *consumed = (const char *)q-starts;
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001683 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 goto onError;
1685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 Py_XDECREF(errorHandler);
1687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 return (PyObject *)unicode;
1689
1690onError:
1691 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 Py_XDECREF(errorHandler);
1693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return NULL;
1695}
1696
Tim Peters772747b2001-08-09 22:21:55 +00001697PyObject *
1698PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001700 const char *errors,
1701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702{
1703 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001704 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001705#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001706 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001707#else
1708 const int pairs = 0;
1709#endif
Tim Peters772747b2001-08-09 22:21:55 +00001710 /* Offsets from p for storing byte pairs in the right order. */
1711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1712 int ihi = 1, ilo = 0;
1713#else
1714 int ihi = 0, ilo = 1;
1715#endif
1716
1717#define STORECHAR(CH) \
1718 do { \
1719 p[ihi] = ((CH) >> 8) & 0xff; \
1720 p[ilo] = (CH) & 0xff; \
1721 p += 2; \
1722 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001725 for (i = pairs = 0; i < size; i++)
1726 if (s[i] >= 0x10000)
1727 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001728#endif
Tim Petersced69f82003-09-16 20:30:58 +00001729 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001730 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 if (v == NULL)
1732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
Tim Peters772747b2001-08-09 22:21:55 +00001734 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001736 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001737 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001738 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001739
1740 if (byteorder == -1) {
1741 /* force LE */
1742 ihi = 1;
1743 ilo = 0;
1744 }
1745 else if (byteorder == 1) {
1746 /* force BE */
1747 ihi = 0;
1748 ilo = 1;
1749 }
1750
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751 while (size-- > 0) {
1752 Py_UNICODE ch = *s++;
1753 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001754#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1757 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001759#endif
Tim Peters772747b2001-08-09 22:21:55 +00001760 STORECHAR(ch);
1761 if (ch2)
1762 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001765#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766}
1767
1768PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1769{
1770 if (!PyUnicode_Check(unicode)) {
1771 PyErr_BadArgument();
1772 return NULL;
1773 }
1774 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1775 PyUnicode_GET_SIZE(unicode),
1776 NULL,
1777 0);
1778}
1779
1780/* --- Unicode Escape Codec ----------------------------------------------- */
1781
Fredrik Lundh06d12682001-01-24 07:59:11 +00001782static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001783
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001785 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 const char *errors)
1787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001788 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001789 Py_ssize_t startinpos;
1790 Py_ssize_t endinpos;
1791 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001796 char* message;
1797 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001798 PyObject *errorHandler = NULL;
1799 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 /* Escaped strings will always be longer than the resulting
1802 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 length after conversion to the true value.
1804 (but if the error callback returns a long replacement string
1805 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 v = _PyUnicode_New(size);
1807 if (v == NULL)
1808 goto onError;
1809 if (size == 0)
1810 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (s < end) {
1816 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001817 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 /* Non-escape characters are interpreted as Unicode ordinals */
1821 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 continue;
1824 }
1825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 /* \ - Escapes */
1828 s++;
1829 switch (*s++) {
1830
1831 /* \x escapes */
1832 case '\n': break;
1833 case '\\': *p++ = '\\'; break;
1834 case '\'': *p++ = '\''; break;
1835 case '\"': *p++ = '\"'; break;
1836 case 'b': *p++ = '\b'; break;
1837 case 'f': *p++ = '\014'; break; /* FF */
1838 case 't': *p++ = '\t'; break;
1839 case 'n': *p++ = '\n'; break;
1840 case 'r': *p++ = '\r'; break;
1841 case 'v': *p++ = '\013'; break; /* VT */
1842 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1843
1844 /* \OOO (octal) escapes */
1845 case '0': case '1': case '2': case '3':
1846 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001847 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001849 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001851 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001853 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 break;
1855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* hex escapes */
1857 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 digits = 2;
1860 message = "truncated \\xXX escape";
1861 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865 digits = 4;
1866 message = "truncated \\uXXXX escape";
1867 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
Fredrik Lundhccc74732001-02-18 22:13:49 +00001869 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001870 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 digits = 8;
1872 message = "truncated \\UXXXXXXXX escape";
1873 hexescape:
1874 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 outpos = p-PyUnicode_AS_UNICODE(v);
1876 if (s+digits>end) {
1877 endinpos = size;
1878 if (unicode_decode_call_errorhandler(
1879 errors, &errorHandler,
1880 "unicodeescape", "end of string in escape sequence",
1881 starts, size, &startinpos, &endinpos, &exc, &s,
1882 (PyObject **)&v, &outpos, &p))
1883 goto onError;
1884 goto nextByte;
1885 }
1886 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001887 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001888 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001889 endinpos = (s+i+1)-starts;
1890 if (unicode_decode_call_errorhandler(
1891 errors, &errorHandler,
1892 "unicodeescape", message,
1893 starts, size, &startinpos, &endinpos, &exc, &s,
1894 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001896 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001897 }
1898 chr = (chr<<4) & ~0xF;
1899 if (c >= '0' && c <= '9')
1900 chr += c - '0';
1901 else if (c >= 'a' && c <= 'f')
1902 chr += 10 + c - 'a';
1903 else
1904 chr += 10 + c - 'A';
1905 }
1906 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001907 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 /* _decoding_error will have already written into the
1909 target buffer. */
1910 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001912 /* when we get here, chr is a 32-bit unicode character */
1913 if (chr <= 0xffff)
1914 /* UCS-2 character */
1915 *p++ = (Py_UNICODE) chr;
1916 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001917 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001918 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001919#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001920 *p++ = chr;
1921#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001922 chr -= 0x10000L;
1923 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001924 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 endinpos = s-starts;
1928 outpos = p-PyUnicode_AS_UNICODE(v);
1929 if (unicode_decode_call_errorhandler(
1930 errors, &errorHandler,
1931 "unicodeescape", "illegal Unicode character",
1932 starts, size, &startinpos, &endinpos, &exc, &s,
1933 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001934 goto onError;
1935 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 break;
1937
1938 /* \N{name} */
1939 case 'N':
1940 message = "malformed \\N character escape";
1941 if (ucnhash_CAPI == NULL) {
1942 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001943 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001944 m = PyImport_ImportModule("unicodedata");
1945 if (m == NULL)
1946 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001948 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001949 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001951 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001952 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 if (ucnhash_CAPI == NULL)
1954 goto ucnhashError;
1955 }
1956 if (*s == '{') {
1957 const char *start = s+1;
1958 /* look for the closing brace */
1959 while (*s != '}' && s < end)
1960 s++;
1961 if (s > start && s < end && *s == '}') {
1962 /* found a name. look it up in the unicode database */
1963 message = "unknown Unicode character name";
1964 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001965 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001966 goto store;
1967 }
1968 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001969 endinpos = s-starts;
1970 outpos = p-PyUnicode_AS_UNICODE(v);
1971 if (unicode_decode_call_errorhandler(
1972 errors, &errorHandler,
1973 "unicodeescape", message,
1974 starts, size, &startinpos, &endinpos, &exc, &s,
1975 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001976 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001977 break;
1978
1979 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001980 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 message = "\\ at end of string";
1982 s--;
1983 endinpos = s-starts;
1984 outpos = p-PyUnicode_AS_UNICODE(v);
1985 if (unicode_decode_call_errorhandler(
1986 errors, &errorHandler,
1987 "unicodeescape", message,
1988 starts, size, &startinpos, &endinpos, &exc, &s,
1989 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001990 goto onError;
1991 }
1992 else {
1993 *p++ = '\\';
1994 *p++ = (unsigned char)s[-1];
1995 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001996 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001998 nextByte:
1999 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002001 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002003 Py_XDECREF(errorHandler);
2004 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002006
Fredrik Lundhccc74732001-02-18 22:13:49 +00002007ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002008 PyErr_SetString(
2009 PyExc_UnicodeError,
2010 "\\N escapes not supported (can't load unicodedata module)"
2011 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002015 return NULL;
2016
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 Py_XDECREF(errorHandler);
2020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 return NULL;
2022}
2023
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of the Unicode data.

   If quotes is true, the string is enclosed in "" or '' quotes as
   appropriate; no u prefix is emitted.

*/
2030
Thomas Wouters477c8d52006-05-27 19:21:47 +00002031Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2032 Py_ssize_t size,
2033 Py_UNICODE ch)
2034{
2035 /* like wcschr, but doesn't stop at NULL characters */
2036
2037 while (size-- > 0) {
2038 if (*s == ch)
2039 return s;
2040 s++;
2041 }
2042
2043 return NULL;
2044}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002045
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046static
2047PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 int quotes)
2050{
2051 PyObject *repr;
2052 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002054 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055
Thomas Wouters89f507f2006-12-13 04:49:30 +00002056 /* XXX(nnorwitz): rather than over-allocating, it would be
2057 better to choose a different scheme. Perhaps scan the
2058 first N-chars of the string and allocate based on that size.
2059 */
2060 /* Initial allocation is based on the longest-possible unichr
2061 escape.
2062
2063 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2064 unichr, so in this case it's the longest unichr escape. In
2065 narrow (UTF-16) builds this is five chars per source unichr
2066 since there are two unichrs in the surrogate pair, so in narrow
2067 (UTF-16) builds it's not the longest unichr escape.
2068
2069 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2070 so in the narrow (UTF-16) build case it's the longest unichr
2071 escape.
2072 */
2073
2074 repr = PyString_FromStringAndSize(NULL,
2075 2
2076#ifdef Py_UNICODE_WIDE
2077 + 10*size
2078#else
2079 + 6*size
2080#endif
2081 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 if (repr == NULL)
2083 return NULL;
2084
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086
2087 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002088 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 !findchar(s, size, '"')) ? '"' : '\'';
2090 }
2091 while (size-- > 0) {
2092 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002093
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002094 /* Escape quotes and backslashes */
2095 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002096 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 *p++ = '\\';
2098 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002099 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002101
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002102#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002103 /* Map 21-bit characters to '\U00xxxxxx' */
2104 else if (ch >= 0x10000) {
2105 *p++ = '\\';
2106 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2108 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2109 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2110 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2111 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2112 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2113 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002114 *p++ = hexdigit[ch & 0x0000000F];
2115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002117#else
2118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002119 else if (ch >= 0xD800 && ch < 0xDC00) {
2120 Py_UNICODE ch2;
2121 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002123 ch2 = *s++;
2124 size--;
2125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2127 *p++ = '\\';
2128 *p++ = 'U';
2129 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2130 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2131 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2132 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2133 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2134 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2135 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2136 *p++ = hexdigit[ucs & 0x0000000F];
2137 continue;
2138 }
2139 /* Fall through: isolated surrogates are copied as-is */
2140 s--;
2141 size++;
2142 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002143#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002146 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 *p++ = '\\';
2148 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002149 *p++ = hexdigit[(ch >> 12) & 0x000F];
2150 *p++ = hexdigit[(ch >> 8) & 0x000F];
2151 *p++ = hexdigit[(ch >> 4) & 0x000F];
2152 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002155 /* Map special whitespace to '\t', \n', '\r' */
2156 else if (ch == '\t') {
2157 *p++ = '\\';
2158 *p++ = 't';
2159 }
2160 else if (ch == '\n') {
2161 *p++ = '\\';
2162 *p++ = 'n';
2163 }
2164 else if (ch == '\r') {
2165 *p++ = '\\';
2166 *p++ = 'r';
2167 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002168
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002169 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002170 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002172 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002173 *p++ = hexdigit[(ch >> 4) & 0x000F];
2174 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002176
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002182 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183
2184 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002185 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return repr;
2187}
2188
2189PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191{
2192 return unicodeescape_string(s, size, 0);
2193}
2194
2195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2196{
2197 if (!PyUnicode_Check(unicode)) {
2198 PyErr_BadArgument();
2199 return NULL;
2200 }
2201 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2202 PyUnicode_GET_SIZE(unicode));
2203}
2204
2205/* --- Raw Unicode Escape Codec ------------------------------------------- */
2206
2207PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 const char *errors)
2210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002212 Py_ssize_t startinpos;
2213 Py_ssize_t endinpos;
2214 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 const char *end;
2218 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 PyObject *errorHandler = NULL;
2220 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002221
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 /* Escaped strings will always be longer than the resulting
2223 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 length after conversion to the true value. (But decoding error
2225 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 v = _PyUnicode_New(size);
2227 if (v == NULL)
2228 goto onError;
2229 if (size == 0)
2230 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 end = s + size;
2233 while (s < end) {
2234 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002235 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002237 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 /* Non-escape characters are interpreted as Unicode ordinals */
2240 if (*s != '\\') {
2241 *p++ = (unsigned char)*s++;
2242 continue;
2243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
2246 /* \u-escapes are only interpreted iff the number of leading
2247 backslashes if odd */
2248 bs = s;
2249 for (;s < end;) {
2250 if (*s != '\\')
2251 break;
2252 *p++ = (unsigned char)*s++;
2253 }
2254 if (((s - bs) & 1) == 0 ||
2255 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 continue;
2258 }
2259 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002260 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 s++;
2262
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002263 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002265 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 endinpos = s-starts;
2269 if (unicode_decode_call_errorhandler(
2270 errors, &errorHandler,
2271 "rawunicodeescape", "truncated \\uXXXX",
2272 starts, size, &startinpos, &endinpos, &exc, &s,
2273 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 x = (x<<4) & ~0xF;
2278 if (c >= '0' && c <= '9')
2279 x += c - '0';
2280 else if (c >= 'a' && c <= 'f')
2281 x += 10 + c - 'a';
2282 else
2283 x += 10 + c - 'A';
2284 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002285#ifndef Py_UNICODE_WIDE
2286 if (x > 0x10000) {
2287 if (unicode_decode_call_errorhandler(
2288 errors, &errorHandler,
2289 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2290 starts, size, &startinpos, &endinpos, &exc, &s,
2291 (PyObject **)&v, &outpos, &p))
2292 goto onError;
2293 }
2294#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 *p++ = x;
2296 nextByte:
2297 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002299 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002300 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002301 Py_XDECREF(errorHandler);
2302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002304
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 onError:
2306 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 Py_XDECREF(errorHandler);
2308 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 return NULL;
2310}
2311
2312PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002313 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314{
2315 PyObject *repr;
2316 char *p;
2317 char *q;
2318
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002319 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002321#ifdef Py_UNICODE_WIDE
2322 repr = PyString_FromStringAndSize(NULL, 10 * size);
2323#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 if (repr == NULL)
2327 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002328 if (size == 0)
2329 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
2331 p = q = PyString_AS_STRING(repr);
2332 while (size-- > 0) {
2333 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002334#ifdef Py_UNICODE_WIDE
2335 /* Map 32-bit characters to '\Uxxxxxxxx' */
2336 if (ch >= 0x10000) {
2337 *p++ = '\\';
2338 *p++ = 'U';
2339 *p++ = hexdigit[(ch >> 28) & 0xf];
2340 *p++ = hexdigit[(ch >> 24) & 0xf];
2341 *p++ = hexdigit[(ch >> 20) & 0xf];
2342 *p++ = hexdigit[(ch >> 16) & 0xf];
2343 *p++ = hexdigit[(ch >> 12) & 0xf];
2344 *p++ = hexdigit[(ch >> 8) & 0xf];
2345 *p++ = hexdigit[(ch >> 4) & 0xf];
2346 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002347 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002348 else
2349#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 /* Map 16-bit characters to '\uxxxx' */
2351 if (ch >= 256) {
2352 *p++ = '\\';
2353 *p++ = 'u';
2354 *p++ = hexdigit[(ch >> 12) & 0xf];
2355 *p++ = hexdigit[(ch >> 8) & 0xf];
2356 *p++ = hexdigit[(ch >> 4) & 0xf];
2357 *p++ = hexdigit[ch & 15];
2358 }
2359 /* Copy everything else as-is */
2360 else
2361 *p++ = (char) ch;
2362 }
2363 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002364 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 return repr;
2366}
2367
2368PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2369{
2370 if (!PyUnicode_Check(unicode)) {
2371 PyErr_BadArgument();
2372 return NULL;
2373 }
2374 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode));
2376}
2377
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002378/* --- Unicode Internal Codec ------------------------------------------- */
2379
2380PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002381 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002382 const char *errors)
2383{
2384 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002385 Py_ssize_t startinpos;
2386 Py_ssize_t endinpos;
2387 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002388 PyUnicodeObject *v;
2389 Py_UNICODE *p;
2390 const char *end;
2391 const char *reason;
2392 PyObject *errorHandler = NULL;
2393 PyObject *exc = NULL;
2394
Neal Norwitzd43069c2006-01-08 01:12:10 +00002395#ifdef Py_UNICODE_WIDE
2396 Py_UNICODE unimax = PyUnicode_GetMax();
2397#endif
2398
Thomas Wouters89f507f2006-12-13 04:49:30 +00002399 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002400 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2401 if (v == NULL)
2402 goto onError;
2403 if (PyUnicode_GetSize((PyObject *)v) == 0)
2404 return (PyObject *)v;
2405 p = PyUnicode_AS_UNICODE(v);
2406 end = s + size;
2407
2408 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002409 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002410 /* We have to sanity check the raw data, otherwise doom looms for
2411 some malformed UCS-4 data. */
2412 if (
2413 #ifdef Py_UNICODE_WIDE
2414 *p > unimax || *p < 0 ||
2415 #endif
2416 end-s < Py_UNICODE_SIZE
2417 )
2418 {
2419 startinpos = s - starts;
2420 if (end-s < Py_UNICODE_SIZE) {
2421 endinpos = end-starts;
2422 reason = "truncated input";
2423 }
2424 else {
2425 endinpos = s - starts + Py_UNICODE_SIZE;
2426 reason = "illegal code point (> 0x10FFFF)";
2427 }
2428 outpos = p - PyUnicode_AS_UNICODE(v);
2429 if (unicode_decode_call_errorhandler(
2430 errors, &errorHandler,
2431 "unicode_internal", reason,
2432 starts, size, &startinpos, &endinpos, &exc, &s,
2433 (PyObject **)&v, &outpos, &p)) {
2434 goto onError;
2435 }
2436 }
2437 else {
2438 p++;
2439 s += Py_UNICODE_SIZE;
2440 }
2441 }
2442
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002443 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002444 goto onError;
2445 Py_XDECREF(errorHandler);
2446 Py_XDECREF(exc);
2447 return (PyObject *)v;
2448
2449 onError:
2450 Py_XDECREF(v);
2451 Py_XDECREF(errorHandler);
2452 Py_XDECREF(exc);
2453 return NULL;
2454}
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456/* --- Latin-1 Codec ------------------------------------------------------ */
2457
2458PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 const char *errors)
2461{
2462 PyUnicodeObject *v;
2463 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002466 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002467 Py_UNICODE r = *(unsigned char*)s;
2468 return PyUnicode_FromUnicode(&r, 1);
2469 }
2470
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 v = _PyUnicode_New(size);
2472 if (v == NULL)
2473 goto onError;
2474 if (size == 0)
2475 return (PyObject *)v;
2476 p = PyUnicode_AS_UNICODE(v);
2477 while (size-- > 0)
2478 *p++ = (unsigned char)*s++;
2479 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 onError:
2482 Py_XDECREF(v);
2483 return NULL;
2484}
2485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486/* create or adjust a UnicodeEncodeError */
2487static void make_encode_exception(PyObject **exceptionObject,
2488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002489 const Py_UNICODE *unicode, Py_ssize_t size,
2490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002493 if (*exceptionObject == NULL) {
2494 *exceptionObject = PyUnicodeEncodeError_Create(
2495 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2499 goto onError;
2500 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2501 goto onError;
2502 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2503 goto onError;
2504 return;
2505 onError:
2506 Py_DECREF(*exceptionObject);
2507 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 }
2509}
2510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511/* raises a UnicodeEncodeError */
2512static void raise_encode_exception(PyObject **exceptionObject,
2513 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 const Py_UNICODE *unicode, Py_ssize_t size,
2515 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 const char *reason)
2517{
2518 make_encode_exception(exceptionObject,
2519 encoding, unicode, size, startpos, endpos, reason);
2520 if (*exceptionObject != NULL)
2521 PyCodec_StrictErrors(*exceptionObject);
2522}
2523
2524/* error handling callback helper:
2525 build arguments, call the callback and check the arguments,
2526 put the result into newpos and return the replacement string, which
2527 has to be freed by the caller */
2528static PyObject *unicode_encode_call_errorhandler(const char *errors,
2529 PyObject **errorHandler,
2530 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2532 Py_ssize_t startpos, Py_ssize_t endpos,
2533 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536
2537 PyObject *restuple;
2538 PyObject *resunicode;
2539
2540 if (*errorHandler == NULL) {
2541 *errorHandler = PyCodec_LookupError(errors);
2542 if (*errorHandler == NULL)
2543 return NULL;
2544 }
2545
2546 make_encode_exception(exceptionObject,
2547 encoding, unicode, size, startpos, endpos, reason);
2548 if (*exceptionObject == NULL)
2549 return NULL;
2550
2551 restuple = PyObject_CallFunctionObjArgs(
2552 *errorHandler, *exceptionObject, NULL);
2553 if (restuple == NULL)
2554 return NULL;
2555 if (!PyTuple_Check(restuple)) {
2556 PyErr_Format(PyExc_TypeError, &argparse[4]);
2557 Py_DECREF(restuple);
2558 return NULL;
2559 }
2560 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2561 &resunicode, newpos)) {
2562 Py_DECREF(restuple);
2563 return NULL;
2564 }
2565 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002566 *newpos = size+*newpos;
2567 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002568 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002569 Py_DECREF(restuple);
2570 return NULL;
2571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002572 Py_INCREF(resunicode);
2573 Py_DECREF(restuple);
2574 return resunicode;
2575}
2576
2577static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002578 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 const char *errors,
2580 int limit)
2581{
2582 /* output object */
2583 PyObject *res;
2584 /* pointers to the beginning and end+1 of input */
2585 const Py_UNICODE *startp = p;
2586 const Py_UNICODE *endp = p + size;
2587 /* pointer to the beginning of the unencodable characters */
2588 /* const Py_UNICODE *badp = NULL; */
2589 /* pointer into the output */
2590 char *str;
2591 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t respos = 0;
2593 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002594 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2595 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 PyObject *errorHandler = NULL;
2597 PyObject *exc = NULL;
2598 /* the following variable is used for caching string comparisons
2599 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2600 int known_errorHandler = -1;
2601
2602 /* allocate enough for a simple encoding without
2603 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002604 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 if (res == NULL)
2606 goto onError;
2607 if (size == 0)
2608 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002609 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 ressize = size;
2611
2612 while (p<endp) {
2613 Py_UNICODE c = *p;
2614
2615 /* can we encode this? */
2616 if (c<limit) {
2617 /* no overflow check, because we know that the space is enough */
2618 *str++ = (char)c;
2619 ++p;
2620 }
2621 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t unicodepos = p-startp;
2623 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002625 Py_ssize_t repsize;
2626 Py_ssize_t newpos;
2627 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 Py_UNICODE *uni2;
2629 /* startpos for collecting unencodable chars */
2630 const Py_UNICODE *collstart = p;
2631 const Py_UNICODE *collend = p;
2632 /* find all unecodable characters */
2633 while ((collend < endp) && ((*collend)>=limit))
2634 ++collend;
2635 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2636 if (known_errorHandler==-1) {
2637 if ((errors==NULL) || (!strcmp(errors, "strict")))
2638 known_errorHandler = 1;
2639 else if (!strcmp(errors, "replace"))
2640 known_errorHandler = 2;
2641 else if (!strcmp(errors, "ignore"))
2642 known_errorHandler = 3;
2643 else if (!strcmp(errors, "xmlcharrefreplace"))
2644 known_errorHandler = 4;
2645 else
2646 known_errorHandler = 0;
2647 }
2648 switch (known_errorHandler) {
2649 case 1: /* strict */
2650 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2651 goto onError;
2652 case 2: /* replace */
2653 while (collstart++<collend)
2654 *str++ = '?'; /* fall through */
2655 case 3: /* ignore */
2656 p = collend;
2657 break;
2658 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002659 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 /* determine replacement size (temporarily (mis)uses p) */
2661 for (p = collstart, repsize = 0; p < collend; ++p) {
2662 if (*p<10)
2663 repsize += 2+1+1;
2664 else if (*p<100)
2665 repsize += 2+2+1;
2666 else if (*p<1000)
2667 repsize += 2+3+1;
2668 else if (*p<10000)
2669 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002670#ifndef Py_UNICODE_WIDE
2671 else
2672 repsize += 2+5+1;
2673#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 else if (*p<100000)
2675 repsize += 2+5+1;
2676 else if (*p<1000000)
2677 repsize += 2+6+1;
2678 else
2679 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002680#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 }
2682 requiredsize = respos+repsize+(endp-collend);
2683 if (requiredsize > ressize) {
2684 if (requiredsize<2*ressize)
2685 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002686 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002688 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 ressize = requiredsize;
2690 }
2691 /* generate replacement (temporarily (mis)uses p) */
2692 for (p = collstart; p < collend; ++p) {
2693 str += sprintf(str, "&#%d;", (int)*p);
2694 }
2695 p = collend;
2696 break;
2697 default:
2698 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2699 encoding, reason, startp, size, &exc,
2700 collstart-startp, collend-startp, &newpos);
2701 if (repunicode == NULL)
2702 goto onError;
2703 /* need more space? (at least enough for what we
2704 have+the replacement+the rest of the string, so
2705 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002706 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 repsize = PyUnicode_GET_SIZE(repunicode);
2708 requiredsize = respos+repsize+(endp-collend);
2709 if (requiredsize > ressize) {
2710 if (requiredsize<2*ressize)
2711 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002712 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 Py_DECREF(repunicode);
2714 goto onError;
2715 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002716 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 ressize = requiredsize;
2718 }
2719 /* check if there is anything unencodable in the replacement
2720 and copy it to the output */
2721 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2722 c = *uni2;
2723 if (c >= limit) {
2724 raise_encode_exception(&exc, encoding, startp, size,
2725 unicodepos, unicodepos+1, reason);
2726 Py_DECREF(repunicode);
2727 goto onError;
2728 }
2729 *str = (char)c;
2730 }
2731 p = startp + newpos;
2732 Py_DECREF(repunicode);
2733 }
2734 }
2735 }
2736 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002737 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 if (respos<ressize)
2739 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002740 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 Py_XDECREF(errorHandler);
2742 Py_XDECREF(exc);
2743 return res;
2744
2745 onError:
2746 Py_XDECREF(res);
2747 Py_XDECREF(errorHandler);
2748 Py_XDECREF(exc);
2749 return NULL;
2750}
2751
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002753 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 const char *errors)
2755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
2759PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2760{
2761 if (!PyUnicode_Check(unicode)) {
2762 PyErr_BadArgument();
2763 return NULL;
2764 }
2765 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2766 PyUnicode_GET_SIZE(unicode),
2767 NULL);
2768}
2769
2770/* --- 7-bit ASCII Codec -------------------------------------------------- */
2771
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002773 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 const char *errors)
2775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 PyUnicodeObject *v;
2778 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002779 Py_ssize_t startinpos;
2780 Py_ssize_t endinpos;
2781 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 const char *e;
2783 PyObject *errorHandler = NULL;
2784 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002787 if (size == 1 && *(unsigned char*)s < 128) {
2788 Py_UNICODE r = *(unsigned char*)s;
2789 return PyUnicode_FromUnicode(&r, 1);
2790 }
Tim Petersced69f82003-09-16 20:30:58 +00002791
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 v = _PyUnicode_New(size);
2793 if (v == NULL)
2794 goto onError;
2795 if (size == 0)
2796 return (PyObject *)v;
2797 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 e = s + size;
2799 while (s < e) {
2800 register unsigned char c = (unsigned char)*s;
2801 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 ++s;
2804 }
2805 else {
2806 startinpos = s-starts;
2807 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002808 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "ascii", "ordinal not in range(128)",
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002817 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002818 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002819 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 onError:
2825 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 Py_XDECREF(errorHandler);
2827 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 return NULL;
2829}
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 const char *errors)
2834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
2838PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2839{
2840 if (!PyUnicode_Check(unicode)) {
2841 PyErr_BadArgument();
2842 return NULL;
2843 }
2844 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2845 PyUnicode_GET_SIZE(unicode),
2846 NULL);
2847}
2848
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002849#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002851/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002853#if SIZEOF_INT < SIZEOF_SSIZE_T
2854#define NEED_RETRY
2855#endif
2856
2857/* XXX This code is limited to "true" double-byte encodings, as
2858 a) it assumes an incomplete character consists of a single byte, and
2859 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2860 encodings, see IsDBCSLeadByteEx documentation. */
2861
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() does not move back from it, when the character found by
   CharPrev() does not start with a lead byte, or when that previous
   character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2872
2873/*
2874 * Decode MBCS string into unicode object. If 'final' is set, converts
2875 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2876 */
2877static int decode_mbcs(PyUnicodeObject **v,
2878 const char *s, /* MBCS string */
2879 int size, /* sizeof MBCS string */
2880 int final)
2881{
2882 Py_UNICODE *p;
2883 Py_ssize_t n = 0;
2884 int usize = 0;
2885
2886 assert(size >= 0);
2887
2888 /* Skip trailing lead-byte unless 'final' is set */
2889 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2890 --size;
2891
2892 /* First get the size of the result */
2893 if (size > 0) {
2894 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2895 if (usize == 0) {
2896 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2897 return -1;
2898 }
2899 }
2900
2901 if (*v == NULL) {
2902 /* Create unicode object */
2903 *v = _PyUnicode_New(usize);
2904 if (*v == NULL)
2905 return -1;
2906 }
2907 else {
2908 /* Extend unicode object */
2909 n = PyUnicode_GET_SIZE(*v);
2910 if (_PyUnicode_Resize(v, n + usize) < 0)
2911 return -1;
2912 }
2913
2914 /* Do the conversion */
2915 if (size > 0) {
2916 p = PyUnicode_AS_UNICODE(*v) + n;
2917 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2918 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2919 return -1;
2920 }
2921 }
2922
2923 return size;
2924}
2925
2926PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2927 Py_ssize_t size,
2928 const char *errors,
2929 Py_ssize_t *consumed)
2930{
2931 PyUnicodeObject *v = NULL;
2932 int done;
2933
2934 if (consumed)
2935 *consumed = 0;
2936
2937#ifdef NEED_RETRY
2938 retry:
2939 if (size > INT_MAX)
2940 done = decode_mbcs(&v, s, INT_MAX, 0);
2941 else
2942#endif
2943 done = decode_mbcs(&v, s, (int)size, !consumed);
2944
2945 if (done < 0) {
2946 Py_XDECREF(v);
2947 return NULL;
2948 }
2949
2950 if (consumed)
2951 *consumed += done;
2952
2953#ifdef NEED_RETRY
2954 if (size > INT_MAX) {
2955 s += done;
2956 size -= done;
2957 goto retry;
2958 }
2959#endif
2960
2961 return (PyObject *)v;
2962}
2963
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002964PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002965 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002966 const char *errors)
2967{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002968 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2969}
2970
2971/*
2972 * Convert unicode into string object (MBCS).
2973 * Returns 0 if succeed, -1 otherwise.
2974 */
2975static int encode_mbcs(PyObject **repr,
2976 const Py_UNICODE *p, /* unicode */
2977 int size) /* size of unicode */
2978{
2979 int mbcssize = 0;
2980 Py_ssize_t n = 0;
2981
2982 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002983
2984 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002985 if (size > 0) {
2986 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2987 if (mbcssize == 0) {
2988 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2989 return -1;
2990 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002991 }
2992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002993 if (*repr == NULL) {
2994 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002995 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002996 if (*repr == NULL)
2997 return -1;
2998 }
2999 else {
3000 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003001 n = PyBytes_Size(*repr);
3002 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003003 return -1;
3004 }
3005
3006 /* Do the conversion */
3007 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003008 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003009 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3010 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3011 return -1;
3012 }
3013 }
3014
3015 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003016}
3017
3018PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 const char *errors)
3021{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003022 PyObject *repr = NULL;
3023 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003024
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003025#ifdef NEED_RETRY
3026 retry:
3027 if (size > INT_MAX)
3028 ret = encode_mbcs(&repr, p, INT_MAX);
3029 else
3030#endif
3031 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003032
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003033 if (ret < 0) {
3034 Py_XDECREF(repr);
3035 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003036 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037
3038#ifdef NEED_RETRY
3039 if (size > INT_MAX) {
3040 p += INT_MAX;
3041 size -= INT_MAX;
3042 goto retry;
3043 }
3044#endif
3045
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003046 return repr;
3047}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003048
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003049PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3050{
3051 if (!PyUnicode_Check(unicode)) {
3052 PyErr_BadArgument();
3053 return NULL;
3054 }
3055 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3056 PyUnicode_GET_SIZE(unicode),
3057 NULL);
3058}
3059
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003060#undef NEED_RETRY
3061
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003062#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064/* --- Character Mapping Codec -------------------------------------------- */
3065
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003067 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 PyObject *mapping,
3069 const char *errors)
3070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003072 Py_ssize_t startinpos;
3073 Py_ssize_t endinpos;
3074 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 PyUnicodeObject *v;
3077 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003078 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 PyObject *errorHandler = NULL;
3080 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003081 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003082 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003083
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 /* Default to Latin-1 */
3085 if (mapping == NULL)
3086 return PyUnicode_DecodeLatin1(s, size, errors);
3087
3088 v = _PyUnicode_New(size);
3089 if (v == NULL)
3090 goto onError;
3091 if (size == 0)
3092 return (PyObject *)v;
3093 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003095 if (PyUnicode_CheckExact(mapping)) {
3096 mapstring = PyUnicode_AS_UNICODE(mapping);
3097 maplen = PyUnicode_GET_SIZE(mapping);
3098 while (s < e) {
3099 unsigned char ch = *s;
3100 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003102 if (ch < maplen)
3103 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 if (x == 0xfffe) {
3106 /* undefined mapping */
3107 outpos = p-PyUnicode_AS_UNICODE(v);
3108 startinpos = s-starts;
3109 endinpos = startinpos+1;
3110 if (unicode_decode_call_errorhandler(
3111 errors, &errorHandler,
3112 "charmap", "character maps to <undefined>",
3113 starts, size, &startinpos, &endinpos, &exc, &s,
3114 (PyObject **)&v, &outpos, &p)) {
3115 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003116 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003117 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003118 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003119 *p++ = x;
3120 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003122 }
3123 else {
3124 while (s < e) {
3125 unsigned char ch = *s;
3126 PyObject *w, *x;
3127
3128 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3129 w = PyInt_FromLong((long)ch);
3130 if (w == NULL)
3131 goto onError;
3132 x = PyObject_GetItem(mapping, w);
3133 Py_DECREF(w);
3134 if (x == NULL) {
3135 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3136 /* No mapping found means: mapping is undefined. */
3137 PyErr_Clear();
3138 x = Py_None;
3139 Py_INCREF(x);
3140 } else
3141 goto onError;
3142 }
3143
3144 /* Apply mapping */
3145 if (PyInt_Check(x)) {
3146 long value = PyInt_AS_LONG(x);
3147 if (value < 0 || value > 65535) {
3148 PyErr_SetString(PyExc_TypeError,
3149 "character mapping must be in range(65536)");
3150 Py_DECREF(x);
3151 goto onError;
3152 }
3153 *p++ = (Py_UNICODE)value;
3154 }
3155 else if (x == Py_None) {
3156 /* undefined mapping */
3157 outpos = p-PyUnicode_AS_UNICODE(v);
3158 startinpos = s-starts;
3159 endinpos = startinpos+1;
3160 if (unicode_decode_call_errorhandler(
3161 errors, &errorHandler,
3162 "charmap", "character maps to <undefined>",
3163 starts, size, &startinpos, &endinpos, &exc, &s,
3164 (PyObject **)&v, &outpos, &p)) {
3165 Py_DECREF(x);
3166 goto onError;
3167 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003168 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003169 continue;
3170 }
3171 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003172 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173
3174 if (targetsize == 1)
3175 /* 1-1 mapping */
3176 *p++ = *PyUnicode_AS_UNICODE(x);
3177
3178 else if (targetsize > 1) {
3179 /* 1-n mapping */
3180 if (targetsize > extrachars) {
3181 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003182 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3183 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003184 (targetsize << 2);
3185 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003186 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003187 if (_PyUnicode_Resize(&v,
3188 PyUnicode_GET_SIZE(v) + needed) < 0) {
3189 Py_DECREF(x);
3190 goto onError;
3191 }
3192 p = PyUnicode_AS_UNICODE(v) + oldpos;
3193 }
3194 Py_UNICODE_COPY(p,
3195 PyUnicode_AS_UNICODE(x),
3196 targetsize);
3197 p += targetsize;
3198 extrachars -= targetsize;
3199 }
3200 /* 1-0 mapping: skip the character */
3201 }
3202 else {
3203 /* wrong return value */
3204 PyErr_SetString(PyExc_TypeError,
3205 "character mapping must return integer, None or unicode");
3206 Py_DECREF(x);
3207 goto onError;
3208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003210 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 }
3213 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003214 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 Py_XDECREF(errorHandler);
3222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 Py_XDECREF(v);
3224 return NULL;
3225}
3226
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003227/* Charmap encoding: the lookup table */
3228
3229struct encoding_map{
3230 PyObject_HEAD
3231 unsigned char level1[32];
3232 int count2, count3;
3233 unsigned char level23[1];
3234};
3235
3236static PyObject*
3237encoding_map_size(PyObject *obj, PyObject* args)
3238{
3239 struct encoding_map *map = (struct encoding_map*)obj;
3240 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3241 128*map->count3);
3242}
3243
3244static PyMethodDef encoding_map_methods[] = {
3245 {"size", encoding_map_size, METH_NOARGS,
3246 PyDoc_STR("Return the size (in bytes) of this object") },
3247 { 0 }
3248};
3249
3250static void
3251encoding_map_dealloc(PyObject* o)
3252{
3253 PyObject_FREE(o);
3254}
3255
3256static PyTypeObject EncodingMapType = {
3257 PyObject_HEAD_INIT(NULL)
3258 0, /*ob_size*/
3259 "EncodingMap", /*tp_name*/
3260 sizeof(struct encoding_map), /*tp_basicsize*/
3261 0, /*tp_itemsize*/
3262 /* methods */
3263 encoding_map_dealloc, /*tp_dealloc*/
3264 0, /*tp_print*/
3265 0, /*tp_getattr*/
3266 0, /*tp_setattr*/
3267 0, /*tp_compare*/
3268 0, /*tp_repr*/
3269 0, /*tp_as_number*/
3270 0, /*tp_as_sequence*/
3271 0, /*tp_as_mapping*/
3272 0, /*tp_hash*/
3273 0, /*tp_call*/
3274 0, /*tp_str*/
3275 0, /*tp_getattro*/
3276 0, /*tp_setattro*/
3277 0, /*tp_as_buffer*/
3278 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3279 0, /*tp_doc*/
3280 0, /*tp_traverse*/
3281 0, /*tp_clear*/
3282 0, /*tp_richcompare*/
3283 0, /*tp_weaklistoffset*/
3284 0, /*tp_iter*/
3285 0, /*tp_iternext*/
3286 encoding_map_methods, /*tp_methods*/
3287 0, /*tp_members*/
3288 0, /*tp_getset*/
3289 0, /*tp_base*/
3290 0, /*tp_dict*/
3291 0, /*tp_descr_get*/
3292 0, /*tp_descr_set*/
3293 0, /*tp_dictoffset*/
3294 0, /*tp_init*/
3295 0, /*tp_alloc*/
3296 0, /*tp_new*/
3297 0, /*tp_free*/
3298 0, /*tp_is_gc*/
3299};
3300
3301PyObject*
3302PyUnicode_BuildEncodingMap(PyObject* string)
3303{
3304 Py_UNICODE *decode;
3305 PyObject *result;
3306 struct encoding_map *mresult;
3307 int i;
3308 int need_dict = 0;
3309 unsigned char level1[32];
3310 unsigned char level2[512];
3311 unsigned char *mlevel1, *mlevel2, *mlevel3;
3312 int count2 = 0, count3 = 0;
3313
3314 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3315 PyErr_BadArgument();
3316 return NULL;
3317 }
3318 decode = PyUnicode_AS_UNICODE(string);
3319 memset(level1, 0xFF, sizeof level1);
3320 memset(level2, 0xFF, sizeof level2);
3321
3322 /* If there isn't a one-to-one mapping of NULL to \0,
3323 or if there are non-BMP characters, we need to use
3324 a mapping dictionary. */
3325 if (decode[0] != 0)
3326 need_dict = 1;
3327 for (i = 1; i < 256; i++) {
3328 int l1, l2;
3329 if (decode[i] == 0
3330 #ifdef Py_UNICODE_WIDE
3331 || decode[i] > 0xFFFF
3332 #endif
3333 ) {
3334 need_dict = 1;
3335 break;
3336 }
3337 if (decode[i] == 0xFFFE)
3338 /* unmapped character */
3339 continue;
3340 l1 = decode[i] >> 11;
3341 l2 = decode[i] >> 7;
3342 if (level1[l1] == 0xFF)
3343 level1[l1] = count2++;
3344 if (level2[l2] == 0xFF)
3345 level2[l2] = count3++;
3346 }
3347
3348 if (count2 >= 0xFF || count3 >= 0xFF)
3349 need_dict = 1;
3350
3351 if (need_dict) {
3352 PyObject *result = PyDict_New();
3353 PyObject *key, *value;
3354 if (!result)
3355 return NULL;
3356 for (i = 0; i < 256; i++) {
3357 key = value = NULL;
3358 key = PyInt_FromLong(decode[i]);
3359 value = PyInt_FromLong(i);
3360 if (!key || !value)
3361 goto failed1;
3362 if (PyDict_SetItem(result, key, value) == -1)
3363 goto failed1;
3364 Py_DECREF(key);
3365 Py_DECREF(value);
3366 }
3367 return result;
3368 failed1:
3369 Py_XDECREF(key);
3370 Py_XDECREF(value);
3371 Py_DECREF(result);
3372 return NULL;
3373 }
3374
3375 /* Create a three-level trie */
3376 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3377 16*count2 + 128*count3 - 1);
3378 if (!result)
3379 return PyErr_NoMemory();
3380 PyObject_Init(result, &EncodingMapType);
3381 mresult = (struct encoding_map*)result;
3382 mresult->count2 = count2;
3383 mresult->count3 = count3;
3384 mlevel1 = mresult->level1;
3385 mlevel2 = mresult->level23;
3386 mlevel3 = mresult->level23 + 16*count2;
3387 memcpy(mlevel1, level1, 32);
3388 memset(mlevel2, 0xFF, 16*count2);
3389 memset(mlevel3, 0, 128*count3);
3390 count3 = 0;
3391 for (i = 1; i < 256; i++) {
3392 int o1, o2, o3, i2, i3;
3393 if (decode[i] == 0xFFFE)
3394 /* unmapped character */
3395 continue;
3396 o1 = decode[i]>>11;
3397 o2 = (decode[i]>>7) & 0xF;
3398 i2 = 16*mlevel1[o1] + o2;
3399 if (mlevel2[i2] == 0xFF)
3400 mlevel2[i2] = count3++;
3401 o3 = decode[i] & 0x7F;
3402 i3 = 128*mlevel2[i2] + o3;
3403 mlevel3[i3] = i;
3404 }
3405 return result;
3406}
3407
3408static int
3409encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3410{
3411 struct encoding_map *map = (struct encoding_map*)mapping;
3412 int l1 = c>>11;
3413 int l2 = (c>>7) & 0xF;
3414 int l3 = c & 0x7F;
3415 int i;
3416
3417#ifdef Py_UNICODE_WIDE
3418 if (c > 0xFFFF) {
3419 return -1;
3420 }
3421#endif
3422 if (c == 0)
3423 return 0;
3424 /* level 1*/
3425 i = map->level1[l1];
3426 if (i == 0xFF) {
3427 return -1;
3428 }
3429 /* level 2*/
3430 i = map->level23[16*i+l2];
3431 if (i == 0xFF) {
3432 return -1;
3433 }
3434 /* level 3 */
3435 i = map->level23[16*map->count2 + 128*i + l3];
3436 if (i == 0) {
3437 return -1;
3438 }
3439 return i;
3440}
3441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442/* Lookup the character ch in the mapping. If the character
3443 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003444 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 PyObject *w = PyInt_FromLong((long)c);
3448 PyObject *x;
3449
3450 if (w == NULL)
3451 return NULL;
3452 x = PyObject_GetItem(mapping, w);
3453 Py_DECREF(w);
3454 if (x == NULL) {
3455 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3456 /* No mapping found means: mapping is undefined. */
3457 PyErr_Clear();
3458 x = Py_None;
3459 Py_INCREF(x);
3460 return x;
3461 } else
3462 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003464 else if (x == Py_None)
3465 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 if (value < 0 || value > 255) {
3469 PyErr_SetString(PyExc_TypeError,
3470 "character mapping must be in range(256)");
3471 Py_DECREF(x);
3472 return NULL;
3473 }
3474 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 else if (PyString_Check(x))
3477 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 /* wrong return value */
3480 PyErr_SetString(PyExc_TypeError,
3481 "character mapping must return integer, None or str");
3482 Py_DECREF(x);
3483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 }
3485}
3486
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003487static int
3488charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3489{
3490 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3491 /* exponentially overallocate to minimize reallocations */
3492 if (requiredsize < 2*outsize)
3493 requiredsize = 2*outsize;
3494 if (_PyString_Resize(outobj, requiredsize)) {
3495 return 0;
3496 }
3497 return 1;
3498}
3499
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503/* lookup the character, put the result in the output string and adjust
3504 various state variables. Reallocate the output string if not enough
3505 space is available. Return a new reference to the object that
3506 was put in the output buffer, or Py_None, if the mapping was undefined
3507 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003508 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003510charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003513 PyObject *rep;
3514 char *outstart;
3515 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003517 if (mapping->ob_type == &EncodingMapType) {
3518 int res = encoding_map_lookup(c, mapping);
3519 Py_ssize_t requiredsize = *outpos+1;
3520 if (res == -1)
3521 return enc_FAILED;
3522 if (outsize<requiredsize)
3523 if (!charmapencode_resize(outobj, outpos, requiredsize))
3524 return enc_EXCEPTION;
3525 outstart = PyString_AS_STRING(*outobj);
3526 outstart[(*outpos)++] = (char)res;
3527 return enc_SUCCESS;
3528 }
3529
3530 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532 return enc_EXCEPTION;
3533 else if (rep==Py_None) {
3534 Py_DECREF(rep);
3535 return enc_FAILED;
3536 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003538 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003542 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003544 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3546 }
3547 else {
3548 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003549 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3550 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003551 if (outsize<requiredsize)
3552 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003554 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003556 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 memcpy(outstart + *outpos, repchars, repsize);
3558 *outpos += repsize;
3559 }
3560 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003561 Py_DECREF(rep);
3562 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563}
3564
3565/* handle an error in PyUnicode_EncodeCharmap
3566 Return 0 on success, -1 on error */
3567static
3568int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003569 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003571 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003572 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573{
3574 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t repsize;
3576 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 Py_UNICODE *uni2;
3578 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t collstartpos = *inpos;
3580 Py_ssize_t collendpos = *inpos+1;
3581 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 char *encoding = "charmap";
3583 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003584 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 /* find all unencodable characters */
3587 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 PyObject *rep;
3589 if (mapping->ob_type == &EncodingMapType) {
3590 int res = encoding_map_lookup(p[collendpos], mapping);
3591 if (res != -1)
3592 break;
3593 ++collendpos;
3594 continue;
3595 }
3596
3597 rep = charmapencode_lookup(p[collendpos], mapping);
3598 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 else if (rep!=Py_None) {
3601 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 break;
3603 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003604 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 ++collendpos;
3606 }
3607 /* cache callback name lookup
3608 * (if not done yet, i.e. it's the first error) */
3609 if (*known_errorHandler==-1) {
3610 if ((errors==NULL) || (!strcmp(errors, "strict")))
3611 *known_errorHandler = 1;
3612 else if (!strcmp(errors, "replace"))
3613 *known_errorHandler = 2;
3614 else if (!strcmp(errors, "ignore"))
3615 *known_errorHandler = 3;
3616 else if (!strcmp(errors, "xmlcharrefreplace"))
3617 *known_errorHandler = 4;
3618 else
3619 *known_errorHandler = 0;
3620 }
3621 switch (*known_errorHandler) {
3622 case 1: /* strict */
3623 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3624 return -1;
3625 case 2: /* replace */
3626 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3627 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003628 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 return -1;
3630 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003631 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3633 return -1;
3634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 }
3636 /* fall through */
3637 case 3: /* ignore */
3638 *inpos = collendpos;
3639 break;
3640 case 4: /* xmlcharrefreplace */
3641 /* generate replacement (temporarily (mis)uses p) */
3642 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3643 char buffer[2+29+1+1];
3644 char *cp;
3645 sprintf(buffer, "&#%d;", (int)p[collpos]);
3646 for (cp = buffer; *cp; ++cp) {
3647 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003648 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003650 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3652 return -1;
3653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 }
3655 }
3656 *inpos = collendpos;
3657 break;
3658 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003659 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 encoding, reason, p, size, exceptionObject,
3661 collstartpos, collendpos, &newpos);
3662 if (repunicode == NULL)
3663 return -1;
3664 /* generate replacement */
3665 repsize = PyUnicode_GET_SIZE(repunicode);
3666 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3667 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003668 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 return -1;
3670 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003671 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3674 return -1;
3675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677 *inpos = newpos;
3678 Py_DECREF(repunicode);
3679 }
3680 return 0;
3681}
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 PyObject *mapping,
3686 const char *errors)
3687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 /* output object */
3689 PyObject *res = NULL;
3690 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003691 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003693 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 PyObject *errorHandler = NULL;
3695 PyObject *exc = NULL;
3696 /* the following variable is used for caching string comparisons
3697 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3698 * 3=ignore, 4=xmlcharrefreplace */
3699 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
3701 /* Default to Latin-1 */
3702 if (mapping == NULL)
3703 return PyUnicode_EncodeLatin1(p, size, errors);
3704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 /* allocate enough for a simple encoding without
3706 replacements, if we need more, we'll resize */
3707 res = PyString_FromStringAndSize(NULL, size);
3708 if (res == NULL)
3709 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003710 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 while (inpos<size) {
3714 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3716 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003718 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 if (charmap_encoding_error(p, size, &inpos, mapping,
3720 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003721 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003722 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003723 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 else
3727 /* done with this character => adjust input position */
3728 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 /* Resize if we allocated to much */
3732 if (respos<PyString_GET_SIZE(res)) {
3733 if (_PyString_Resize(&res, respos))
3734 goto onError;
3735 }
3736 Py_XDECREF(exc);
3737 Py_XDECREF(errorHandler);
3738 return res;
3739
3740 onError:
3741 Py_XDECREF(res);
3742 Py_XDECREF(exc);
3743 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 return NULL;
3745}
3746
3747PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3748 PyObject *mapping)
3749{
3750 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3751 PyErr_BadArgument();
3752 return NULL;
3753 }
3754 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3755 PyUnicode_GET_SIZE(unicode),
3756 mapping,
3757 NULL);
3758}
3759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760/* create or adjust a UnicodeTranslateError */
3761static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003762 const Py_UNICODE *unicode, Py_ssize_t size,
3763 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 if (*exceptionObject == NULL) {
3767 *exceptionObject = PyUnicodeTranslateError_Create(
3768 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
3770 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3772 goto onError;
3773 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3774 goto onError;
3775 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3776 goto onError;
3777 return;
3778 onError:
3779 Py_DECREF(*exceptionObject);
3780 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
3782}
3783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784/* raises a UnicodeTranslateError */
3785static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 const Py_UNICODE *unicode, Py_ssize_t size,
3787 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 const char *reason)
3789{
3790 make_translate_exception(exceptionObject,
3791 unicode, size, startpos, endpos, reason);
3792 if (*exceptionObject != NULL)
3793 PyCodec_StrictErrors(*exceptionObject);
3794}
3795
3796/* error handling callback helper:
3797 build arguments, call the callback and check the arguments,
3798 put the result into newpos and return the replacement string, which
3799 has to be freed by the caller */
3800static PyObject *unicode_translate_call_errorhandler(const char *errors,
3801 PyObject **errorHandler,
3802 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003803 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3804 Py_ssize_t startpos, Py_ssize_t endpos,
3805 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003807 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003809 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 PyObject *restuple;
3811 PyObject *resunicode;
3812
3813 if (*errorHandler == NULL) {
3814 *errorHandler = PyCodec_LookupError(errors);
3815 if (*errorHandler == NULL)
3816 return NULL;
3817 }
3818
3819 make_translate_exception(exceptionObject,
3820 unicode, size, startpos, endpos, reason);
3821 if (*exceptionObject == NULL)
3822 return NULL;
3823
3824 restuple = PyObject_CallFunctionObjArgs(
3825 *errorHandler, *exceptionObject, NULL);
3826 if (restuple == NULL)
3827 return NULL;
3828 if (!PyTuple_Check(restuple)) {
3829 PyErr_Format(PyExc_TypeError, &argparse[4]);
3830 Py_DECREF(restuple);
3831 return NULL;
3832 }
3833 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_DECREF(restuple);
3836 return NULL;
3837 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003838 if (i_newpos<0)
3839 *newpos = size+i_newpos;
3840 else
3841 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003842 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003843 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003844 Py_DECREF(restuple);
3845 return NULL;
3846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 Py_INCREF(resunicode);
3848 Py_DECREF(restuple);
3849 return resunicode;
3850}
3851
3852/* Lookup the character ch in the mapping and put the result in result,
3853 which must be decrefed by the caller.
3854 Return 0 on success, -1 on error */
3855static
3856int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3857{
3858 PyObject *w = PyInt_FromLong((long)c);
3859 PyObject *x;
3860
3861 if (w == NULL)
3862 return -1;
3863 x = PyObject_GetItem(mapping, w);
3864 Py_DECREF(w);
3865 if (x == NULL) {
3866 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3867 /* No mapping found means: use 1:1 mapping. */
3868 PyErr_Clear();
3869 *result = NULL;
3870 return 0;
3871 } else
3872 return -1;
3873 }
3874 else if (x == Py_None) {
3875 *result = x;
3876 return 0;
3877 }
3878 else if (PyInt_Check(x)) {
3879 long value = PyInt_AS_LONG(x);
3880 long max = PyUnicode_GetMax();
3881 if (value < 0 || value > max) {
3882 PyErr_Format(PyExc_TypeError,
3883 "character mapping must be in range(0x%lx)", max+1);
3884 Py_DECREF(x);
3885 return -1;
3886 }
3887 *result = x;
3888 return 0;
3889 }
3890 else if (PyUnicode_Check(x)) {
3891 *result = x;
3892 return 0;
3893 }
3894 else {
3895 /* wrong return value */
3896 PyErr_SetString(PyExc_TypeError,
3897 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003898 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 return -1;
3900 }
3901}
3902/* ensure that *outobj is at least requiredsize characters long,
3903if not reallocate and adjust various state variables.
3904Return 0 on success, -1 on error */
3905static
Walter Dörwald4894c302003-10-24 14:25:28 +00003906int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003907 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003910 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003914 if (requiredsize < 2 * oldsize)
3915 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003916 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 return -1;
3918 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 }
3920 return 0;
3921}
3922/* lookup the character, put the result in the output string and adjust
3923 various state variables. Return a new reference to the object that
3924 was put in the output buffer in *result, or Py_None, if the mapping was
3925 undefined (in which case no character was written).
3926 The called must decref result.
3927 Return 0 on success, -1 on error. */
3928static
Walter Dörwald4894c302003-10-24 14:25:28 +00003929int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003931 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932{
Walter Dörwald4894c302003-10-24 14:25:28 +00003933 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 return -1;
3935 if (*res==NULL) {
3936 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003937 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 }
3939 else if (*res==Py_None)
3940 ;
3941 else if (PyInt_Check(*res)) {
3942 /* no overflow check, because we know that the space is enough */
3943 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3944 }
3945 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 if (repsize==1) {
3948 /* no overflow check, because we know that the space is enough */
3949 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3950 }
3951 else if (repsize!=0) {
3952 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003954 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003955 repsize - 1;
3956 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 return -1;
3958 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3959 *outp += repsize;
3960 }
3961 }
3962 else
3963 return -1;
3964 return 0;
3965}
3966
3967PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 PyObject *mapping,
3970 const char *errors)
3971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 /* output object */
3973 PyObject *res = NULL;
3974 /* pointers to the beginning and end+1 of input */
3975 const Py_UNICODE *startp = p;
3976 const Py_UNICODE *endp = p + size;
3977 /* pointer into the output */
3978 Py_UNICODE *str;
3979 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 char *reason = "character maps to <undefined>";
3982 PyObject *errorHandler = NULL;
3983 PyObject *exc = NULL;
3984 /* the following variable is used for caching string comparisons
3985 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3986 * 3=ignore, 4=xmlcharrefreplace */
3987 int known_errorHandler = -1;
3988
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (mapping == NULL) {
3990 PyErr_BadArgument();
3991 return NULL;
3992 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993
3994 /* allocate enough for a simple 1:1 translation without
3995 replacements, if we need more, we'll resize */
3996 res = PyUnicode_FromUnicode(NULL, size);
3997 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003998 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 return res;
4001 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 while (p<endp) {
4004 /* try to encode it */
4005 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004006 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 goto onError;
4009 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004010 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 if (x!=Py_None) /* it worked => adjust input pointer */
4012 ++p;
4013 else { /* untranslatable character */
4014 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 Py_ssize_t repsize;
4016 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 Py_UNICODE *uni2;
4018 /* startpos for collecting untranslatable chars */
4019 const Py_UNICODE *collstart = p;
4020 const Py_UNICODE *collend = p+1;
4021 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 /* find all untranslatable characters */
4024 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004025 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 goto onError;
4027 Py_XDECREF(x);
4028 if (x!=Py_None)
4029 break;
4030 ++collend;
4031 }
4032 /* cache callback name lookup
4033 * (if not done yet, i.e. it's the first error) */
4034 if (known_errorHandler==-1) {
4035 if ((errors==NULL) || (!strcmp(errors, "strict")))
4036 known_errorHandler = 1;
4037 else if (!strcmp(errors, "replace"))
4038 known_errorHandler = 2;
4039 else if (!strcmp(errors, "ignore"))
4040 known_errorHandler = 3;
4041 else if (!strcmp(errors, "xmlcharrefreplace"))
4042 known_errorHandler = 4;
4043 else
4044 known_errorHandler = 0;
4045 }
4046 switch (known_errorHandler) {
4047 case 1: /* strict */
4048 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4049 goto onError;
4050 case 2: /* replace */
4051 /* No need to check for space, this is a 1:1 replacement */
4052 for (coll = collstart; coll<collend; ++coll)
4053 *str++ = '?';
4054 /* fall through */
4055 case 3: /* ignore */
4056 p = collend;
4057 break;
4058 case 4: /* xmlcharrefreplace */
4059 /* generate replacement (temporarily (mis)uses p) */
4060 for (p = collstart; p < collend; ++p) {
4061 char buffer[2+29+1+1];
4062 char *cp;
4063 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004064 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4066 goto onError;
4067 for (cp = buffer; *cp; ++cp)
4068 *str++ = *cp;
4069 }
4070 p = collend;
4071 break;
4072 default:
4073 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4074 reason, startp, size, &exc,
4075 collstart-startp, collend-startp, &newpos);
4076 if (repunicode == NULL)
4077 goto onError;
4078 /* generate replacement */
4079 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004080 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4082 Py_DECREF(repunicode);
4083 goto onError;
4084 }
4085 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4086 *str++ = *uni2;
4087 p = startp + newpos;
4088 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 }
4090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 /* Resize if we allocated to much */
4093 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004094 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004095 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 }
4098 Py_XDECREF(exc);
4099 Py_XDECREF(errorHandler);
4100 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 onError:
4103 Py_XDECREF(res);
4104 Py_XDECREF(exc);
4105 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 return NULL;
4107}
4108
4109PyObject *PyUnicode_Translate(PyObject *str,
4110 PyObject *mapping,
4111 const char *errors)
4112{
4113 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004114
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 str = PyUnicode_FromObject(str);
4116 if (str == NULL)
4117 goto onError;
4118 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4119 PyUnicode_GET_SIZE(str),
4120 mapping,
4121 errors);
4122 Py_DECREF(str);
4123 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 onError:
4126 Py_XDECREF(str);
4127 return NULL;
4128}
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossum9e896b32000-04-05 20:11:21 +00004130/* --- Decimal Encoder ---------------------------------------------------- */
4131
4132int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004133 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004134 char *output,
4135 const char *errors)
4136{
4137 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 PyObject *errorHandler = NULL;
4139 PyObject *exc = NULL;
4140 const char *encoding = "decimal";
4141 const char *reason = "invalid decimal Unicode string";
4142 /* the following variable is used for caching string comparisons
4143 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4144 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004145
4146 if (output == NULL) {
4147 PyErr_BadArgument();
4148 return -1;
4149 }
4150
4151 p = s;
4152 end = s + length;
4153 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004155 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t repsize;
4158 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 Py_UNICODE *uni2;
4160 Py_UNICODE *collstart;
4161 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004162
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 if (Py_UNICODE_ISSPACE(ch)) {
4164 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004166 continue;
4167 }
4168 decimal = Py_UNICODE_TODECIMAL(ch);
4169 if (decimal >= 0) {
4170 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004172 continue;
4173 }
Guido van Rossumba477042000-04-06 18:18:10 +00004174 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004175 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004177 continue;
4178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* All other characters are considered unencodable */
4180 collstart = p;
4181 collend = p+1;
4182 while (collend < end) {
4183 if ((0 < *collend && *collend < 256) ||
4184 !Py_UNICODE_ISSPACE(*collend) ||
4185 Py_UNICODE_TODECIMAL(*collend))
4186 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 /* cache callback name lookup
4189 * (if not done yet, i.e. it's the first error) */
4190 if (known_errorHandler==-1) {
4191 if ((errors==NULL) || (!strcmp(errors, "strict")))
4192 known_errorHandler = 1;
4193 else if (!strcmp(errors, "replace"))
4194 known_errorHandler = 2;
4195 else if (!strcmp(errors, "ignore"))
4196 known_errorHandler = 3;
4197 else if (!strcmp(errors, "xmlcharrefreplace"))
4198 known_errorHandler = 4;
4199 else
4200 known_errorHandler = 0;
4201 }
4202 switch (known_errorHandler) {
4203 case 1: /* strict */
4204 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4205 goto onError;
4206 case 2: /* replace */
4207 for (p = collstart; p < collend; ++p)
4208 *output++ = '?';
4209 /* fall through */
4210 case 3: /* ignore */
4211 p = collend;
4212 break;
4213 case 4: /* xmlcharrefreplace */
4214 /* generate replacement (temporarily (mis)uses p) */
4215 for (p = collstart; p < collend; ++p)
4216 output += sprintf(output, "&#%d;", (int)*p);
4217 p = collend;
4218 break;
4219 default:
4220 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4221 encoding, reason, s, length, &exc,
4222 collstart-s, collend-s, &newpos);
4223 if (repunicode == NULL)
4224 goto onError;
4225 /* generate replacement */
4226 repsize = PyUnicode_GET_SIZE(repunicode);
4227 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4228 Py_UNICODE ch = *uni2;
4229 if (Py_UNICODE_ISSPACE(ch))
4230 *output++ = ' ';
4231 else {
4232 decimal = Py_UNICODE_TODECIMAL(ch);
4233 if (decimal >= 0)
4234 *output++ = '0' + decimal;
4235 else if (0 < ch && ch < 256)
4236 *output++ = (char)ch;
4237 else {
4238 Py_DECREF(repunicode);
4239 raise_encode_exception(&exc, encoding,
4240 s, length, collstart-s, collend-s, reason);
4241 goto onError;
4242 }
4243 }
4244 }
4245 p = s + newpos;
4246 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004247 }
4248 }
4249 /* 0-terminate the output string */
4250 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 Py_XDECREF(exc);
4252 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004253 return 0;
4254
4255 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 Py_XDECREF(exc);
4257 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004258 return -1;
4259}
4260
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261/* --- Helpers ------------------------------------------------------------ */
4262
Thomas Wouters477c8d52006-05-27 19:21:47 +00004263#define STRINGLIB_CHAR Py_UNICODE
4264
4265#define STRINGLIB_LEN PyUnicode_GET_SIZE
4266#define STRINGLIB_NEW PyUnicode_FromUnicode
4267#define STRINGLIB_STR PyUnicode_AS_UNICODE
4268
4269Py_LOCAL_INLINE(int)
4270STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004272 if (str[0] != other[0])
4273 return 1;
4274 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275}
4276
Thomas Wouters477c8d52006-05-27 19:21:47 +00004277#define STRINGLIB_EMPTY unicode_empty
4278
4279#include "stringlib/fastsearch.h"
4280
4281#include "stringlib/count.h"
4282#include "stringlib/find.h"
4283#include "stringlib/partition.h"
4284
/* helper macro to fixup start/end slice values: negative values of the
   local variables `start` and `end` are taken relative to (obj)->length,
   and both are clipped so they are never negative and `end` never
   exceeds the length.  Wrapped in do { } while (0) so the macro acts as
   a single statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4297
Martin v. Löwis18e16552006-02-15 17:27:45 +00004298Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004299 PyObject *substr,
4300 Py_ssize_t start,
4301 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004304 PyUnicodeObject* str_obj;
4305 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004306
Thomas Wouters477c8d52006-05-27 19:21:47 +00004307 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4308 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004310 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4311 if (!sub_obj) {
4312 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return -1;
4314 }
Tim Petersced69f82003-09-16 20:30:58 +00004315
Thomas Wouters477c8d52006-05-27 19:21:47 +00004316 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004317
Thomas Wouters477c8d52006-05-27 19:21:47 +00004318 result = stringlib_count(
4319 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4320 );
4321
4322 Py_DECREF(sub_obj);
4323 Py_DECREF(str_obj);
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return result;
4326}
4327
Martin v. Löwis18e16552006-02-15 17:27:45 +00004328Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004329 PyObject *sub,
4330 Py_ssize_t start,
4331 Py_ssize_t end,
4332 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004334 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004337 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004338 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004339 sub = PyUnicode_FromObject(sub);
4340 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004341 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004342 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 }
Tim Petersced69f82003-09-16 20:30:58 +00004344
Thomas Wouters477c8d52006-05-27 19:21:47 +00004345 if (direction > 0)
4346 result = stringlib_find_slice(
4347 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4348 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4349 start, end
4350 );
4351 else
4352 result = stringlib_rfind_slice(
4353 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4354 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4355 start, end
4356 );
4357
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004359 Py_DECREF(sub);
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 return result;
4362}
4363
Tim Petersced69f82003-09-16 20:30:58 +00004364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365int tailmatch(PyUnicodeObject *self,
4366 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t start,
4368 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 int direction)
4370{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 if (substring->length == 0)
4372 return 1;
4373
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
4376 end -= substring->length;
4377 if (end < start)
4378 return 0;
4379
4380 if (direction > 0) {
4381 if (Py_UNICODE_MATCH(self, end, substring))
4382 return 1;
4383 } else {
4384 if (Py_UNICODE_MATCH(self, start, substring))
4385 return 1;
4386 }
4387
4388 return 0;
4389}
4390
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t start,
4394 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 int direction)
4396{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 str = PyUnicode_FromObject(str);
4400 if (str == NULL)
4401 return -1;
4402 substr = PyUnicode_FromObject(substr);
4403 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004404 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return -1;
4406 }
Tim Petersced69f82003-09-16 20:30:58 +00004407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 result = tailmatch((PyUnicodeObject *)str,
4409 (PyUnicodeObject *)substr,
4410 start, end, direction);
4411 Py_DECREF(str);
4412 Py_DECREF(substr);
4413 return result;
4414}
4415
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416/* Apply fixfct filter to the Unicode object self and return a
4417 reference to the modified object */
4418
Tim Petersced69f82003-09-16 20:30:58 +00004419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420PyObject *fixup(PyUnicodeObject *self,
4421 int (*fixfct)(PyUnicodeObject *s))
4422{
4423
4424 PyUnicodeObject *u;
4425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004426 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 if (u == NULL)
4428 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004429
4430 Py_UNICODE_COPY(u->str, self->str, self->length);
4431
Tim Peters7a29bd52001-09-12 03:03:31 +00004432 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 /* fixfct should return TRUE if it modified the buffer. If
4434 FALSE, return a reference to the original buffer instead
4435 (to save space, not time) */
4436 Py_INCREF(self);
4437 Py_DECREF(u);
4438 return (PyObject*) self;
4439 }
4440 return (PyObject*) u;
4441}
4442
Tim Petersced69f82003-09-16 20:30:58 +00004443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444int fixupper(PyUnicodeObject *self)
4445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 Py_UNICODE *s = self->str;
4448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 while (len-- > 0) {
4451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 ch = Py_UNICODE_TOUPPER(*s);
4454 if (ch != *s) {
4455 status = 1;
4456 *s = ch;
4457 }
4458 s++;
4459 }
4460
4461 return status;
4462}
4463
Tim Petersced69f82003-09-16 20:30:58 +00004464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465int fixlower(PyUnicodeObject *self)
4466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 Py_UNICODE *s = self->str;
4469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 while (len-- > 0) {
4472 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 ch = Py_UNICODE_TOLOWER(*s);
4475 if (ch != *s) {
4476 status = 1;
4477 *s = ch;
4478 }
4479 s++;
4480 }
4481
4482 return status;
4483}
4484
Tim Petersced69f82003-09-16 20:30:58 +00004485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486int fixswapcase(PyUnicodeObject *self)
4487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 Py_UNICODE *s = self->str;
4490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004491
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 while (len-- > 0) {
4493 if (Py_UNICODE_ISUPPER(*s)) {
4494 *s = Py_UNICODE_TOLOWER(*s);
4495 status = 1;
4496 } else if (Py_UNICODE_ISLOWER(*s)) {
4497 *s = Py_UNICODE_TOUPPER(*s);
4498 status = 1;
4499 }
4500 s++;
4501 }
4502
4503 return status;
4504}
4505
Tim Petersced69f82003-09-16 20:30:58 +00004506static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507int fixcapitalize(PyUnicodeObject *self)
4508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004510 Py_UNICODE *s = self->str;
4511 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004512
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004513 if (len == 0)
4514 return 0;
4515 if (Py_UNICODE_ISLOWER(*s)) {
4516 *s = Py_UNICODE_TOUPPER(*s);
4517 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004519 s++;
4520 while (--len > 0) {
4521 if (Py_UNICODE_ISUPPER(*s)) {
4522 *s = Py_UNICODE_TOLOWER(*s);
4523 status = 1;
4524 }
4525 s++;
4526 }
4527 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528}
4529
4530static
4531int fixtitle(PyUnicodeObject *self)
4532{
4533 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4534 register Py_UNICODE *e;
4535 int previous_is_cased;
4536
4537 /* Shortcut for single character strings */
4538 if (PyUnicode_GET_SIZE(self) == 1) {
4539 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4540 if (*p != ch) {
4541 *p = ch;
4542 return 1;
4543 }
4544 else
4545 return 0;
4546 }
Tim Petersced69f82003-09-16 20:30:58 +00004547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 e = p + PyUnicode_GET_SIZE(self);
4549 previous_is_cased = 0;
4550 for (; p < e; p++) {
4551 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004552
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 if (previous_is_cased)
4554 *p = Py_UNICODE_TOLOWER(ch);
4555 else
4556 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004557
4558 if (Py_UNICODE_ISLOWER(ch) ||
4559 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 Py_UNICODE_ISTITLE(ch))
4561 previous_is_cased = 1;
4562 else
4563 previous_is_cased = 0;
4564 }
4565 return 1;
4566}
4567
Tim Peters8ce9f162004-08-27 01:49:32 +00004568PyObject *
4569PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Tim Peters8ce9f162004-08-27 01:49:32 +00004571 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004572 const Py_UNICODE blank = ' ';
4573 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004574 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004575 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004576 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4577 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004578 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4579 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004581 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004582 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Tim Peters05eba1f2004-08-27 21:32:02 +00004584 fseq = PySequence_Fast(seq, "");
4585 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004586 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004587 }
4588
Tim Peters91879ab2004-08-27 22:35:44 +00004589 /* Grrrr. A codec may be invoked to convert str objects to
4590 * Unicode, and so it's possible to call back into Python code
4591 * during PyUnicode_FromObject(), and so it's possible for a sick
4592 * codec to change the size of fseq (if seq is a list). Therefore
4593 * we have to keep refetching the size -- can't assume seqlen
4594 * is invariant.
4595 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004596 seqlen = PySequence_Fast_GET_SIZE(fseq);
4597 /* If empty sequence, return u"". */
4598 if (seqlen == 0) {
4599 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4600 goto Done;
4601 }
4602 /* If singleton sequence with an exact Unicode, return that. */
4603 if (seqlen == 1) {
4604 item = PySequence_Fast_GET_ITEM(fseq, 0);
4605 if (PyUnicode_CheckExact(item)) {
4606 Py_INCREF(item);
4607 res = (PyUnicodeObject *)item;
4608 goto Done;
4609 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004610 }
4611
Tim Peters05eba1f2004-08-27 21:32:02 +00004612 /* At least two items to join, or one that isn't exact Unicode. */
4613 if (seqlen > 1) {
4614 /* Set up sep and seplen -- they're needed. */
4615 if (separator == NULL) {
4616 sep = &blank;
4617 seplen = 1;
4618 }
4619 else {
4620 internal_separator = PyUnicode_FromObject(separator);
4621 if (internal_separator == NULL)
4622 goto onError;
4623 sep = PyUnicode_AS_UNICODE(internal_separator);
4624 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004625 /* In case PyUnicode_FromObject() mutated seq. */
4626 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004627 }
4628 }
4629
4630 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004631 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004633 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 res_p = PyUnicode_AS_UNICODE(res);
4635 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004636
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004638 Py_ssize_t itemlen;
4639 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004640
4641 item = PySequence_Fast_GET_ITEM(fseq, i);
4642 /* Convert item to Unicode. */
4643 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4644 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004645 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004646 " %.80s found",
4647 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004648 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004649 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 item = PyUnicode_FromObject(item);
4651 if (item == NULL)
4652 goto onError;
4653 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004654
Tim Peters91879ab2004-08-27 22:35:44 +00004655 /* In case PyUnicode_FromObject() mutated seq. */
4656 seqlen = PySequence_Fast_GET_SIZE(fseq);
4657
Tim Peters8ce9f162004-08-27 01:49:32 +00004658 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004660 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004662 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 if (i < seqlen - 1) {
4664 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004665 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 goto Overflow;
4667 }
4668 if (new_res_used > res_alloc) {
4669 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004670 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004672 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004673 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004674 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004675 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004676 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004678 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004681
4682 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004683 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 res_p += itemlen;
4685 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004686 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004687 res_p += seplen;
4688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 res_used = new_res_used;
4691 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004692
Tim Peters05eba1f2004-08-27 21:32:02 +00004693 /* Shrink res to match the used area; this probably can't fail,
4694 * but it's cheap to check.
4695 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004696 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004697 goto onError;
4698
4699 Done:
4700 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004701 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return (PyObject *)res;
4703
Tim Peters8ce9f162004-08-27 01:49:32 +00004704 Overflow:
4705 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004706 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004707 Py_DECREF(item);
4708 /* fall through */
4709
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004711 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004712 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004713 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 return NULL;
4715}
4716
Tim Petersced69f82003-09-16 20:30:58 +00004717static
4718PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004719 Py_ssize_t left,
4720 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 Py_UNICODE fill)
4722{
4723 PyUnicodeObject *u;
4724
4725 if (left < 0)
4726 left = 0;
4727 if (right < 0)
4728 right = 0;
4729
Tim Peters7a29bd52001-09-12 03:03:31 +00004730 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 Py_INCREF(self);
4732 return self;
4733 }
4734
4735 u = _PyUnicode_New(left + self->length + right);
4736 if (u) {
4737 if (left)
4738 Py_UNICODE_FILL(u->str, fill, left);
4739 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4740 if (right)
4741 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4742 }
4743
4744 return u;
4745}
4746
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `list` object and an `onError` label; control jumps to that label when
   the slice cannot be created or the append fails.  The reference held
   in `str` is released on every path.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement and must be terminated with a semicolon by the
   caller. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4757
4758static
4759PyObject *split_whitespace(PyUnicodeObject *self,
4760 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 register Py_ssize_t i;
4764 register Py_ssize_t j;
4765 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 PyObject *str;
4767
4768 for (i = j = 0; i < len; ) {
4769 /* find a token */
4770 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4771 i++;
4772 j = i;
4773 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4774 i++;
4775 if (j < i) {
4776 if (maxcount-- <= 0)
4777 break;
4778 SPLIT_APPEND(self->str, j, i);
4779 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4780 i++;
4781 j = i;
4782 }
4783 }
4784 if (j < len) {
4785 SPLIT_APPEND(self->str, j, len);
4786 }
4787 return list;
4788
4789 onError:
4790 Py_DECREF(list);
4791 return NULL;
4792}
4793
4794PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004795 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004797 register Py_ssize_t i;
4798 register Py_ssize_t j;
4799 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 PyObject *list;
4801 PyObject *str;
4802 Py_UNICODE *data;
4803
4804 string = PyUnicode_FromObject(string);
4805 if (string == NULL)
4806 return NULL;
4807 data = PyUnicode_AS_UNICODE(string);
4808 len = PyUnicode_GET_SIZE(string);
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 list = PyList_New(0);
4811 if (!list)
4812 goto onError;
4813
4814 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004818 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
4821 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004822 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 if (i < len) {
4824 if (data[i] == '\r' && i + 1 < len &&
4825 data[i+1] == '\n')
4826 i += 2;
4827 else
4828 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004829 if (keepends)
4830 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Guido van Rossum86662912000-04-11 15:38:46 +00004832 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 j = i;
4834 }
4835 if (j < len) {
4836 SPLIT_APPEND(data, j, len);
4837 }
4838
4839 Py_DECREF(string);
4840 return list;
4841
4842 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004843 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 Py_DECREF(string);
4845 return NULL;
4846}
4847
Tim Petersced69f82003-09-16 20:30:58 +00004848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849PyObject *split_char(PyUnicodeObject *self,
4850 PyObject *list,
4851 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 register Py_ssize_t i;
4855 register Py_ssize_t j;
4856 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 PyObject *str;
4858
4859 for (i = j = 0; i < len; ) {
4860 if (self->str[i] == ch) {
4861 if (maxcount-- <= 0)
4862 break;
4863 SPLIT_APPEND(self->str, j, i);
4864 i = j = i + 1;
4865 } else
4866 i++;
4867 }
4868 if (j <= len) {
4869 SPLIT_APPEND(self->str, j, len);
4870 }
4871 return list;
4872
4873 onError:
4874 Py_DECREF(list);
4875 return NULL;
4876}
4877
Tim Petersced69f82003-09-16 20:30:58 +00004878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879PyObject *split_substring(PyUnicodeObject *self,
4880 PyObject *list,
4881 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004882 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 register Py_ssize_t i;
4885 register Py_ssize_t j;
4886 Py_ssize_t len = self->length;
4887 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 PyObject *str;
4889
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004890 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 if (Py_UNICODE_MATCH(self, i, substring)) {
4892 if (maxcount-- <= 0)
4893 break;
4894 SPLIT_APPEND(self->str, j, i);
4895 i = j = i + sublen;
4896 } else
4897 i++;
4898 }
4899 if (j <= len) {
4900 SPLIT_APPEND(self->str, j, len);
4901 }
4902 return list;
4903
4904 onError:
4905 Py_DECREF(list);
4906 return NULL;
4907}
4908
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004909static
4910PyObject *rsplit_whitespace(PyUnicodeObject *self,
4911 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004913{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 register Py_ssize_t i;
4915 register Py_ssize_t j;
4916 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004917 PyObject *str;
4918
4919 for (i = j = len - 1; i >= 0; ) {
4920 /* find a token */
4921 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4922 i--;
4923 j = i;
4924 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4925 i--;
4926 if (j > i) {
4927 if (maxcount-- <= 0)
4928 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004929 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004930 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4931 i--;
4932 j = i;
4933 }
4934 }
4935 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004936 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004937 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004938 if (PyList_Reverse(list) < 0)
4939 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940 return list;
4941
4942 onError:
4943 Py_DECREF(list);
4944 return NULL;
4945}
4946
4947static
4948PyObject *rsplit_char(PyUnicodeObject *self,
4949 PyObject *list,
4950 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004951 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004953 register Py_ssize_t i;
4954 register Py_ssize_t j;
4955 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004956 PyObject *str;
4957
4958 for (i = j = len - 1; i >= 0; ) {
4959 if (self->str[i] == ch) {
4960 if (maxcount-- <= 0)
4961 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004962 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004963 j = i = i - 1;
4964 } else
4965 i--;
4966 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004967 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004968 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004970 if (PyList_Reverse(list) < 0)
4971 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972 return list;
4973
4974 onError:
4975 Py_DECREF(list);
4976 return NULL;
4977}
4978
4979static
4980PyObject *rsplit_substring(PyUnicodeObject *self,
4981 PyObject *list,
4982 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004983 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 register Py_ssize_t i;
4986 register Py_ssize_t j;
4987 Py_ssize_t len = self->length;
4988 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 PyObject *str;
4990
4991 for (i = len - sublen, j = len; i >= 0; ) {
4992 if (Py_UNICODE_MATCH(self, i, substring)) {
4993 if (maxcount-- <= 0)
4994 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004995 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996 j = i;
4997 i -= sublen;
4998 } else
4999 i--;
5000 }
5001 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005002 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005004 if (PyList_Reverse(list) < 0)
5005 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005006 return list;
5007
5008 onError:
5009 Py_DECREF(list);
5010 return NULL;
5011}
5012
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013#undef SPLIT_APPEND
5014
5015static
5016PyObject *split(PyUnicodeObject *self,
5017 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005018 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019{
5020 PyObject *list;
5021
5022 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005023 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
5025 list = PyList_New(0);
5026 if (!list)
5027 return NULL;
5028
5029 if (substring == NULL)
5030 return split_whitespace(self,list,maxcount);
5031
5032 else if (substring->length == 1)
5033 return split_char(self,list,substring->str[0],maxcount);
5034
5035 else if (substring->length == 0) {
5036 Py_DECREF(list);
5037 PyErr_SetString(PyExc_ValueError, "empty separator");
5038 return NULL;
5039 }
5040 else
5041 return split_substring(self,list,substring,maxcount);
5042}
5043
Tim Petersced69f82003-09-16 20:30:58 +00005044static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005045PyObject *rsplit(PyUnicodeObject *self,
5046 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048{
5049 PyObject *list;
5050
5051 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005052 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005053
5054 list = PyList_New(0);
5055 if (!list)
5056 return NULL;
5057
5058 if (substring == NULL)
5059 return rsplit_whitespace(self,list,maxcount);
5060
5061 else if (substring->length == 1)
5062 return rsplit_char(self,list,substring->str[0],maxcount);
5063
5064 else if (substring->length == 0) {
5065 Py_DECREF(list);
5066 PyErr_SetString(PyExc_ValueError, "empty separator");
5067 return NULL;
5068 }
5069 else
5070 return rsplit_substring(self,list,substring,maxcount);
5071}
5072
5073static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074PyObject *replace(PyUnicodeObject *self,
5075 PyUnicodeObject *str1,
5076 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078{
5079 PyUnicodeObject *u;
5080
5081 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005082 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Thomas Wouters477c8d52006-05-27 19:21:47 +00005084 if (str1->length == str2->length) {
5085 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005086 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005087 if (str1->length == 1) {
5088 /* replace characters */
5089 Py_UNICODE u1, u2;
5090 if (!findchar(self->str, self->length, str1->str[0]))
5091 goto nothing;
5092 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5093 if (!u)
5094 return NULL;
5095 Py_UNICODE_COPY(u->str, self->str, self->length);
5096 u1 = str1->str[0];
5097 u2 = str2->str[0];
5098 for (i = 0; i < u->length; i++)
5099 if (u->str[i] == u1) {
5100 if (--maxcount < 0)
5101 break;
5102 u->str[i] = u2;
5103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005105 i = fastsearch(
5106 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005108 if (i < 0)
5109 goto nothing;
5110 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5111 if (!u)
5112 return NULL;
5113 Py_UNICODE_COPY(u->str, self->str, self->length);
5114 while (i <= self->length - str1->length)
5115 if (Py_UNICODE_MATCH(self, i, str1)) {
5116 if (--maxcount < 0)
5117 break;
5118 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5119 i += str1->length;
5120 } else
5121 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005124
5125 Py_ssize_t n, i, j, e;
5126 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 Py_UNICODE *p;
5128
5129 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005130 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 if (n > maxcount)
5132 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005133 if (n == 0)
5134 goto nothing;
5135 /* new_size = self->length + n * (str2->length - str1->length)); */
5136 delta = (str2->length - str1->length);
5137 if (delta == 0) {
5138 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005140 product = n * (str2->length - str1->length);
5141 if ((product / (str2->length - str1->length)) != n) {
5142 PyErr_SetString(PyExc_OverflowError,
5143 "replace string is too long");
5144 return NULL;
5145 }
5146 new_size = self->length + product;
5147 if (new_size < 0) {
5148 PyErr_SetString(PyExc_OverflowError,
5149 "replace string is too long");
5150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
5152 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 u = _PyUnicode_New(new_size);
5154 if (!u)
5155 return NULL;
5156 i = 0;
5157 p = u->str;
5158 e = self->length - str1->length;
5159 if (str1->length > 0) {
5160 while (n-- > 0) {
5161 /* look for next match */
5162 j = i;
5163 while (j <= e) {
5164 if (Py_UNICODE_MATCH(self, j, str1))
5165 break;
5166 j++;
5167 }
5168 if (j > i) {
5169 if (j > e)
5170 break;
5171 /* copy unchanged part [i:j] */
5172 Py_UNICODE_COPY(p, self->str+i, j-i);
5173 p += j - i;
5174 }
5175 /* copy substitution string */
5176 if (str2->length > 0) {
5177 Py_UNICODE_COPY(p, str2->str, str2->length);
5178 p += str2->length;
5179 }
5180 i = j + str1->length;
5181 }
5182 if (i < self->length)
5183 /* copy tail [i:] */
5184 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5185 } else {
5186 /* interleave */
5187 while (n > 0) {
5188 Py_UNICODE_COPY(p, str2->str, str2->length);
5189 p += str2->length;
5190 if (--n <= 0)
5191 break;
5192 *p++ = self->str[i++];
5193 }
5194 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005198
5199nothing:
5200 /* nothing to replace; return original string (when possible) */
5201 if (PyUnicode_CheckExact(self)) {
5202 Py_INCREF(self);
5203 return (PyObject *) self;
5204 }
5205 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206}
5207
5208/* --- Unicode Object Methods --------------------------------------------- */
5209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005210PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211"S.title() -> unicode\n\
5212\n\
5213Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005214characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005217unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return fixup(self, fixtitle);
5220}
5221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005222PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223"S.capitalize() -> unicode\n\
5224\n\
5225Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005226have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005229unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 return fixup(self, fixcapitalize);
5232}
5233
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, capitalizes every
   word in place in the list, and joins the words with the default
   separator.  Returns NULL when any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (result == NULL)
            goto onError;       /* result is NULL, which is returned */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5271
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005272/* Argument converter. Coerces to a single unicode character */
5273
5274static int
5275convert_uc(PyObject *obj, void *addr)
5276{
5277 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5278 PyObject *uniobj;
5279 Py_UNICODE *unistr;
5280
5281 uniobj = PyUnicode_FromObject(obj);
5282 if (uniobj == NULL) {
5283 PyErr_SetString(PyExc_TypeError,
5284 "The fill character cannot be converted to Unicode");
5285 return 0;
5286 }
5287 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5288 PyErr_SetString(PyExc_TypeError,
5289 "The fill character must be exactly one character long");
5290 Py_DECREF(uniobj);
5291 return 0;
5292 }
5293 unistr = PyUnicode_AS_UNICODE(uniobj);
5294 *fillcharloc = unistr[0];
5295 Py_DECREF(uniobj);
5296 return 1;
5297}
5298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005299PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005300"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005302Return S centered in a Unicode string of length width. Padding is\n\
5303done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305static PyObject *
5306unicode_center(PyUnicodeObject *self, PyObject *args)
5307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t marg, left;
5309 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005310 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Thomas Woutersde017742006-02-16 19:34:37 +00005312 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 return NULL;
5314
Tim Peters7a29bd52001-09-12 03:03:31 +00005315 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 Py_INCREF(self);
5317 return (PyObject*) self;
5318 }
5319
5320 marg = width - self->length;
5321 left = marg / 2 + (marg & width & 1);
5322
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005323 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324}
5325
Marc-André Lemburge5034372000-08-08 08:04:29 +00005326#if 0
5327
5328/* This code should go into some future Unicode collation support
5329 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005330 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005331
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005332/* speedy UTF-16 code point order comparison */
5333/* gleaned from: */
5334/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5335
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005336static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005337{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005338 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005339 0, 0, 0, 0, 0, 0, 0, 0,
5340 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005341 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005342};
5343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344static int
5345unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005348
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_UNICODE *s1 = str1->str;
5350 Py_UNICODE *s2 = str2->str;
5351
5352 len1 = str1->length;
5353 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005356 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005357
5358 c1 = *s1++;
5359 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005360
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005361 if (c1 > (1<<11) * 26)
5362 c1 += utf16Fixup[c1>>11];
5363 if (c2 > (1<<11) * 26)
5364 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005365 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005366
5367 if (c1 != c2)
5368 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005369
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005370 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 }
5372
5373 return (len1 < len2) ? -1 : (len1 != len2);
5374}
5375
Marc-André Lemburge5034372000-08-08 08:04:29 +00005376#else
5377
5378static int
5379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382
5383 Py_UNICODE *s1 = str1->str;
5384 Py_UNICODE *s2 = str2->str;
5385
5386 len1 = str1->length;
5387 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Marc-André Lemburge5034372000-08-08 08:04:29 +00005389 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005390 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005391
Fredrik Lundh45714e92001-06-26 16:39:36 +00005392 c1 = *s1++;
5393 c2 = *s2++;
5394
5395 if (c1 != c2)
5396 return (c1 < c2) ? -1 : 1;
5397
Marc-André Lemburge5034372000-08-08 08:04:29 +00005398 len1--; len2--;
5399 }
5400
5401 return (len1 < len2) ? -1 : (len1 != len2);
5402}
5403
5404#endif
5405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406int PyUnicode_Compare(PyObject *left,
5407 PyObject *right)
5408{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005409 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5410 return unicode_compare((PyUnicodeObject *)left,
5411 (PyUnicodeObject *)right);
5412 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5413 (PyUnicode_Check(left) && PyString_Check(right))) {
5414 if (PyUnicode_Check(left))
5415 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5416 if (PyUnicode_Check(right))
5417 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5418 assert(PyString_Check(left));
5419 assert(PyString_Check(right));
5420 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005422 PyErr_Format(PyExc_TypeError,
5423 "Can't compare %.100s and %.100s",
5424 left->ob_type->tp_name,
5425 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 return -1;
5427}
5428
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005429PyObject *PyUnicode_RichCompare(PyObject *left,
5430 PyObject *right,
5431 int op)
5432{
5433 int result;
5434
5435 result = PyUnicode_Compare(left, right);
5436 if (result == -1 && PyErr_Occurred())
5437 goto onError;
5438
5439 /* Convert the return value to a Boolean */
5440 switch (op) {
5441 case Py_EQ:
5442 result = (result == 0);
5443 break;
5444 case Py_NE:
5445 result = (result != 0);
5446 break;
5447 case Py_LE:
5448 result = (result <= 0);
5449 break;
5450 case Py_GE:
5451 result = (result >= 0);
5452 break;
5453 case Py_LT:
5454 result = (result == -1);
5455 break;
5456 case Py_GT:
5457 result = (result == 1);
5458 break;
5459 }
5460 return PyBool_FromLong(result);
5461
5462 onError:
5463
5464 /* Standard case
5465
5466 Type errors mean that PyUnicode_FromObject() could not convert
5467 one of the arguments (usually the right hand side) to Unicode,
5468 ie. we can't handle the comparison request. However, it is
5469 possible that the other object knows a comparison method, which
5470 is why we return Py_NotImplemented to give the other object a
5471 chance.
5472
5473 */
5474 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5475 PyErr_Clear();
5476 Py_INCREF(Py_NotImplemented);
5477 return Py_NotImplemented;
5478 }
5479 if (op != Py_EQ && op != Py_NE)
5480 return NULL;
5481
5482 /* Equality comparison.
5483
5484 This is a special case: we silence any PyExc_UnicodeDecodeError
5485 and instead turn it into a PyErr_UnicodeWarning.
5486
5487 */
5488 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5489 return NULL;
5490 PyErr_Clear();
5491 if (PyErr_Warn(PyExc_UnicodeWarning,
5492 (op == Py_EQ) ?
5493 "Unicode equal comparison "
5494 "failed to convert both arguments to Unicode - "
5495 "interpreting them as being unequal" :
5496 "Unicode unequal comparison "
5497 "failed to convert both arguments to Unicode - "
5498 "interpreting them as being unequal"
5499 ) < 0)
5500 return NULL;
5501 result = (op == Py_NE);
5502 return PyBool_FromLong(result);
5503}
5504
Guido van Rossum403d68b2000-03-13 15:55:09 +00005505int PyUnicode_Contains(PyObject *container,
5506 PyObject *element)
5507{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005508 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005510
5511 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005512 sub = PyUnicode_FromObject(element);
5513 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005514 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005515 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005516 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005517 }
5518
Thomas Wouters477c8d52006-05-27 19:21:47 +00005519 str = PyUnicode_FromObject(container);
5520 if (!str) {
5521 Py_DECREF(sub);
5522 return -1;
5523 }
5524
5525 result = stringlib_contains_obj(str, sub);
5526
5527 Py_DECREF(str);
5528 Py_DECREF(sub);
5529
Guido van Rossum403d68b2000-03-13 15:55:09 +00005530 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005531}
5532
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533/* Concat to string or Unicode object giving a new Unicode object. */
5534
5535PyObject *PyUnicode_Concat(PyObject *left,
5536 PyObject *right)
5537{
5538 PyUnicodeObject *u = NULL, *v = NULL, *w;
5539
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005540 if (PyBytes_Check(left) || PyBytes_Check(right))
5541 return PyBytes_Concat(left, right);
5542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 /* Coerce the two arguments */
5544 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5545 if (u == NULL)
5546 goto onError;
5547 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5548 if (v == NULL)
5549 goto onError;
5550
5551 /* Shortcuts */
5552 if (v == unicode_empty) {
5553 Py_DECREF(v);
5554 return (PyObject *)u;
5555 }
5556 if (u == unicode_empty) {
5557 Py_DECREF(u);
5558 return (PyObject *)v;
5559 }
5560
5561 /* Concat the two Unicode strings */
5562 w = _PyUnicode_New(u->length + v->length);
5563 if (w == NULL)
5564 goto onError;
5565 Py_UNICODE_COPY(w->str, u->str, u->length);
5566 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5567
5568 Py_DECREF(u);
5569 Py_DECREF(v);
5570 return (PyObject *)w;
5571
5572onError:
5573 Py_XDECREF(u);
5574 Py_XDECREF(v);
5575 return NULL;
5576}
5577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579"S.count(sub[, start[, end]]) -> int\n\
5580\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005581Return the number of non-overlapping occurrences of substring sub in\n\
5582Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005583interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
5585static PyObject *
5586unicode_count(PyUnicodeObject *self, PyObject *args)
5587{
5588 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005590 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 PyObject *result;
5592
Guido van Rossumb8872e62000-05-09 14:14:27 +00005593 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5594 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 return NULL;
5596
5597 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005598 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 if (substring == NULL)
5600 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005601
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604 result = PyInt_FromSsize_t(
5605 stringlib_count(self->str + start, end - start,
5606 substring->str, substring->length)
5607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
5609 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return result;
5612}
5613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005614PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005615"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005617Encodes S using the codec registered for encoding. encoding defaults\n\
5618to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005619handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5621'xmlcharrefreplace' as well as any other name registered with\n\
5622codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
5624static PyObject *
5625unicode_encode(PyUnicodeObject *self, PyObject *args)
5626{
5627 char *encoding = NULL;
5628 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005629 PyObject *v;
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5632 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005633 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005634 if (v == NULL)
5635 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005636 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005637 if (PyString_Check(v)) {
5638 /* Old codec, turn it into bytes */
5639 PyObject *b = PyBytes_FromObject(v);
5640 Py_DECREF(v);
5641 return b;
5642 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005643 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005644 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005645 "(type=%.400s)",
5646 v->ob_type->tp_name);
5647 Py_DECREF(v);
5648 return NULL;
5649 }
5650 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005651
5652 onError:
5653 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005654}
5655
5656PyDoc_STRVAR(decode__doc__,
5657"S.decode([encoding[,errors]]) -> string or unicode\n\
5658\n\
5659Decodes S using the codec registered for encoding. encoding defaults\n\
5660to the default encoding. errors may be given to set a different error\n\
5661handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5662a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5663as well as any other name registerd with codecs.register_error that is\n\
5664able to handle UnicodeDecodeErrors.");
5665
5666static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005667unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005668{
5669 char *encoding = NULL;
5670 char *errors = NULL;
5671 PyObject *v;
5672
5673 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5674 return NULL;
5675 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005676 if (v == NULL)
5677 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005678 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5679 PyErr_Format(PyExc_TypeError,
5680 "decoder did not return a string/unicode object "
5681 "(type=%.400s)",
5682 v->ob_type->tp_name);
5683 Py_DECREF(v);
5684 return NULL;
5685 }
5686 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005687
5688 onError:
5689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690}
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693"S.expandtabs([tabsize]) -> unicode\n\
5694\n\
5695Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
5698static PyObject*
5699unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5700{
5701 Py_UNICODE *e;
5702 Py_UNICODE *p;
5703 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005704 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 PyUnicodeObject *u;
5706 int tabsize = 8;
5707
5708 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5709 return NULL;
5710
Thomas Wouters7e474022000-07-16 12:04:32 +00005711 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 i = j = 0;
5713 e = self->str + self->length;
5714 for (p = self->str; p < e; p++)
5715 if (*p == '\t') {
5716 if (tabsize > 0)
5717 j += tabsize - (j % tabsize);
5718 }
5719 else {
5720 j++;
5721 if (*p == '\n' || *p == '\r') {
5722 i += j;
5723 j = 0;
5724 }
5725 }
5726
5727 /* Second pass: create output string and fill it */
5728 u = _PyUnicode_New(i + j);
5729 if (!u)
5730 return NULL;
5731
5732 j = 0;
5733 q = u->str;
5734
5735 for (p = self->str; p < e; p++)
5736 if (*p == '\t') {
5737 if (tabsize > 0) {
5738 i = tabsize - (j % tabsize);
5739 j += i;
5740 while (i--)
5741 *q++ = ' ';
5742 }
5743 }
5744 else {
5745 j++;
5746 *q++ = *p;
5747 if (*p == '\n' || *p == '\r')
5748 j = 0;
5749 }
5750
5751 return (PyObject*) u;
5752}
5753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005754PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755"S.find(sub [,start [,end]]) -> int\n\
5756\n\
5757Return the lowest index in S where substring sub is found,\n\
5758such that sub is contained within s[start,end]. Optional\n\
5759arguments start and end are interpreted as in slice notation.\n\
5760\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005761Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
5763static PyObject *
5764unicode_find(PyUnicodeObject *self, PyObject *args)
5765{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005766 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005767 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005768 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005769 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
Guido van Rossumb8872e62000-05-09 14:14:27 +00005771 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5772 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 substring = PyUnicode_FromObject(substring);
5775 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 return NULL;
5777
Thomas Wouters477c8d52006-05-27 19:21:47 +00005778 result = stringlib_find_slice(
5779 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5780 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5781 start, end
5782 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
5784 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005785
5786 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787}
5788
5789static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005790unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791{
5792 if (index < 0 || index >= self->length) {
5793 PyErr_SetString(PyExc_IndexError, "string index out of range");
5794 return NULL;
5795 }
5796
5797 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5798}
5799
5800static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005801unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005803 /* Since Unicode objects compare equal to their UTF-8 string
5804 counterparts, we hash the UTF-8 string. */
5805 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5806 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807}
5808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005809PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810"S.index(sub [,start [,end]]) -> int\n\
5811\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813
5814static PyObject *
5815unicode_index(PyUnicodeObject *self, PyObject *args)
5816{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005817 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005818 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005820 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Guido van Rossumb8872e62000-05-09 14:14:27 +00005822 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5823 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 substring = PyUnicode_FromObject(substring);
5826 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 return NULL;
5828
Thomas Wouters477c8d52006-05-27 19:21:47 +00005829 result = stringlib_find_slice(
5830 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5831 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5832 start, end
5833 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834
5835 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 if (result < 0) {
5838 PyErr_SetString(PyExc_ValueError, "substring not found");
5839 return NULL;
5840 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841
Martin v. Löwis18e16552006-02-15 17:27:45 +00005842 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843}
5844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005846"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005848Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005849at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
5851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005852unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
5854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5855 register const Py_UNICODE *e;
5856 int cased;
5857
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 /* Shortcut for single character strings */
5859 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005860 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005862 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005863 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005864 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005865
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 e = p + PyUnicode_GET_SIZE(self);
5867 cased = 0;
5868 for (; p < e; p++) {
5869 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005872 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 else if (!cased && Py_UNICODE_ISLOWER(ch))
5874 cased = 1;
5875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005876 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005879PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005880"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005882Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005883at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
5885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005886unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887{
5888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5889 register const Py_UNICODE *e;
5890 int cased;
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 /* Shortcut for single character strings */
5893 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005894 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005896 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005897 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 e = p + PyUnicode_GET_SIZE(self);
5901 cased = 0;
5902 for (; p < e; p++) {
5903 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005906 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 else if (!cased && Py_UNICODE_ISUPPER(ch))
5908 cased = 1;
5909 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005910 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911}
5912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005913PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005914"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005916Return True if S is a titlecased string and there is at least one\n\
5917character in S, i.e. upper- and titlecase characters may only\n\
5918follow uncased characters and lowercase characters only cased ones.\n\
5919Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
5921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005922unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
5924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5925 register const Py_UNICODE *e;
5926 int cased, previous_is_cased;
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 /* Shortcut for single character strings */
5929 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005930 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5931 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005933 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005934 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005935 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 e = p + PyUnicode_GET_SIZE(self);
5938 cased = 0;
5939 previous_is_cased = 0;
5940 for (; p < e; p++) {
5941 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005942
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5944 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 previous_is_cased = 1;
5947 cased = 1;
5948 }
5949 else if (Py_UNICODE_ISLOWER(ch)) {
5950 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 previous_is_cased = 1;
5953 cased = 1;
5954 }
5955 else
5956 previous_is_cased = 0;
5957 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005958 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959}
5960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005961PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005962"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005964Return True if all characters in S are whitespace\n\
5965and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
5967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005968unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969{
5970 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5971 register const Py_UNICODE *e;
5972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Shortcut for single character strings */
5974 if (PyUnicode_GET_SIZE(self) == 1 &&
5975 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005976 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005978 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005979 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 e = p + PyUnicode_GET_SIZE(self);
5983 for (; p < e; p++) {
5984 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988}
5989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005990PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005991"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005992\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005993Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005994and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005995
5996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005997unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005998{
5999 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6000 register const Py_UNICODE *e;
6001
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006002 /* Shortcut for single character strings */
6003 if (PyUnicode_GET_SIZE(self) == 1 &&
6004 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006005 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006006
6007 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006008 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006010
6011 e = p + PyUnicode_GET_SIZE(self);
6012 for (; p < e; p++) {
6013 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006016 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006017}
6018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006020"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006022Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006023and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006024
6025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006026unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006027{
6028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6029 register const Py_UNICODE *e;
6030
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006031 /* Shortcut for single character strings */
6032 if (PyUnicode_GET_SIZE(self) == 1 &&
6033 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006034 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006035
6036 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006037 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006039
6040 e = p + PyUnicode_GET_SIZE(self);
6041 for (; p < e; p++) {
6042 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006043 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006044 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006045 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046}
6047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006048PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006049"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006051Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006052False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053
6054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006055unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056{
6057 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6058 register const Py_UNICODE *e;
6059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 /* Shortcut for single character strings */
6061 if (PyUnicode_GET_SIZE(self) == 1 &&
6062 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006063 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006065 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006066 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006067 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 e = p + PyUnicode_GET_SIZE(self);
6070 for (; p < e; p++) {
6071 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006072 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006074 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075}
6076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006077PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006078"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006080Return True if all characters in S are digits\n\
6081and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
6083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006084unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
6086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6087 register const Py_UNICODE *e;
6088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 /* Shortcut for single character strings */
6090 if (PyUnicode_GET_SIZE(self) == 1 &&
6091 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006092 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006094 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006095 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006096 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 e = p + PyUnicode_GET_SIZE(self);
6099 for (; p < e; p++) {
6100 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006101 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006103 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104}
6105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006106PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006109Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006110False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
6112static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006113unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
6115 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6116 register const Py_UNICODE *e;
6117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 /* Shortcut for single character strings */
6119 if (PyUnicode_GET_SIZE(self) == 1 &&
6120 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006123 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006124 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006125 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 e = p + PyUnicode_GET_SIZE(self);
6128 for (; p < e; p++) {
6129 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006130 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006132 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133}
6134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006135PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136"S.join(sequence) -> unicode\n\
6137\n\
6138Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006139sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
6141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006142unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006144 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Martin v. Löwis18e16552006-02-15 17:27:45 +00006147static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148unicode_length(PyUnicodeObject *self)
6149{
6150 return self->length;
6151}
6152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006153PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006154"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155\n\
6156Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006157done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
6159static PyObject *
6160unicode_ljust(PyUnicodeObject *self, PyObject *args)
6161{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006162 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006163 Py_UNICODE fillchar = ' ';
6164
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006165 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 return NULL;
6167
Tim Peters7a29bd52001-09-12 03:03:31 +00006168 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 Py_INCREF(self);
6170 return (PyObject*) self;
6171 }
6172
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006173 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174}
6175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006176PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177"S.lower() -> unicode\n\
6178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006179Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006182unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 return fixup(self, fixlower);
6185}
6186
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6195
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006196/* externally visible for str.strip(unicode) */
6197PyObject *
6198_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6199{
6200 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006201 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006202 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006203 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6204 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006205
Thomas Wouters477c8d52006-05-27 19:21:47 +00006206 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6207
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006208 i = 0;
6209 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006210 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6211 i++;
6212 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006213 }
6214
6215 j = len;
6216 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217 do {
6218 j--;
6219 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6220 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006221 }
6222
6223 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006224 Py_INCREF(self);
6225 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006226 }
6227 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006228 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006229}
6230
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231
6232static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006233do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006235 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006236 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006237
6238 i = 0;
6239 if (striptype != RIGHTSTRIP) {
6240 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6241 i++;
6242 }
6243 }
6244
6245 j = len;
6246 if (striptype != LEFTSTRIP) {
6247 do {
6248 j--;
6249 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6250 j++;
6251 }
6252
6253 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6254 Py_INCREF(self);
6255 return (PyObject*)self;
6256 }
6257 else
6258 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259}
6260
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006261
6262static PyObject *
6263do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6264{
6265 PyObject *sep = NULL;
6266
6267 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6268 return NULL;
6269
6270 if (sep != NULL && sep != Py_None) {
6271 if (PyUnicode_Check(sep))
6272 return _PyUnicode_XStrip(self, striptype, sep);
6273 else if (PyString_Check(sep)) {
6274 PyObject *res;
6275 sep = PyUnicode_FromObject(sep);
6276 if (sep==NULL)
6277 return NULL;
6278 res = _PyUnicode_XStrip(self, striptype, sep);
6279 Py_DECREF(sep);
6280 return res;
6281 }
6282 else {
6283 PyErr_Format(PyExc_TypeError,
6284 "%s arg must be None, unicode or str",
6285 STRIPNAME(striptype));
6286 return NULL;
6287 }
6288 }
6289
6290 return do_strip(self, striptype);
6291}
6292
6293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006295"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006296\n\
6297Return a copy of the string S with leading and trailing\n\
6298whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006299If chars is given and not None, remove characters in chars instead.\n\
6300If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006301
6302static PyObject *
6303unicode_strip(PyUnicodeObject *self, PyObject *args)
6304{
6305 if (PyTuple_GET_SIZE(args) == 0)
6306 return do_strip(self, BOTHSTRIP); /* Common case */
6307 else
6308 return do_argstrip(self, BOTHSTRIP, args);
6309}
6310
6311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006312PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006313"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006314\n\
6315Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006316If chars is given and not None, remove characters in chars instead.\n\
6317If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006318
6319static PyObject *
6320unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6321{
6322 if (PyTuple_GET_SIZE(args) == 0)
6323 return do_strip(self, LEFTSTRIP); /* Common case */
6324 else
6325 return do_argstrip(self, LEFTSTRIP, args);
6326}
6327
6328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006329PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006330"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006331\n\
6332Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006333If chars is given and not None, remove characters in chars instead.\n\
6334If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006335
6336static PyObject *
6337unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6338{
6339 if (PyTuple_GET_SIZE(args) == 0)
6340 return do_strip(self, RIGHTSTRIP); /* Common case */
6341 else
6342 return do_argstrip(self, RIGHTSTRIP, args);
6343}
6344
6345
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348{
6349 PyUnicodeObject *u;
6350 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006351 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006352 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354 if (len < 0)
6355 len = 0;
6356
Tim Peters7a29bd52001-09-12 03:03:31 +00006357 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 /* no repeat, return original string */
6359 Py_INCREF(str);
6360 return (PyObject*) str;
6361 }
Tim Peters8f422462000-09-09 06:13:41 +00006362
6363 /* ensure # of chars needed doesn't overflow int and # of bytes
6364 * needed doesn't overflow size_t
6365 */
6366 nchars = len * str->length;
6367 if (len && nchars / len != str->length) {
6368 PyErr_SetString(PyExc_OverflowError,
6369 "repeated string is too long");
6370 return NULL;
6371 }
6372 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6373 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6374 PyErr_SetString(PyExc_OverflowError,
6375 "repeated string is too long");
6376 return NULL;
6377 }
6378 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 if (!u)
6380 return NULL;
6381
6382 p = u->str;
6383
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384 if (str->length == 1 && len > 0) {
6385 Py_UNICODE_FILL(p, str->str[0], len);
6386 } else {
6387 Py_ssize_t done = 0; /* number of characters copied this far */
6388 if (done < nchars) {
6389 Py_UNICODE_COPY(p, str->str, str->length);
6390 done = str->length;
6391 }
6392 while (done < nchars) {
6393 int n = (done <= nchars-done) ? done : nchars-done;
6394 Py_UNICODE_COPY(p+done, p, n);
6395 done += n;
6396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 }
6398
6399 return (PyObject*) u;
6400}
6401
6402PyObject *PyUnicode_Replace(PyObject *obj,
6403 PyObject *subobj,
6404 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006405 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
6407 PyObject *self;
6408 PyObject *str1;
6409 PyObject *str2;
6410 PyObject *result;
6411
6412 self = PyUnicode_FromObject(obj);
6413 if (self == NULL)
6414 return NULL;
6415 str1 = PyUnicode_FromObject(subobj);
6416 if (str1 == NULL) {
6417 Py_DECREF(self);
6418 return NULL;
6419 }
6420 str2 = PyUnicode_FromObject(replobj);
6421 if (str2 == NULL) {
6422 Py_DECREF(self);
6423 Py_DECREF(str1);
6424 return NULL;
6425 }
Tim Petersced69f82003-09-16 20:30:58 +00006426 result = replace((PyUnicodeObject *)self,
6427 (PyUnicodeObject *)str1,
6428 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 maxcount);
6430 Py_DECREF(self);
6431 Py_DECREF(str1);
6432 Py_DECREF(str2);
6433 return result;
6434}
6435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437"S.replace (old, new[, maxsplit]) -> unicode\n\
6438\n\
6439Return a copy of S with all occurrences of substring\n\
6440old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006441given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443static PyObject*
6444unicode_replace(PyUnicodeObject *self, PyObject *args)
6445{
6446 PyUnicodeObject *str1;
6447 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006448 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 PyObject *result;
6450
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 return NULL;
6453 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6454 if (str1 == NULL)
6455 return NULL;
6456 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006457 if (str2 == NULL) {
6458 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
6462 result = replace(self, str1, str2, maxcount);
6463
6464 Py_DECREF(str1);
6465 Py_DECREF(str2);
6466 return result;
6467}
6468
6469static
6470PyObject *unicode_repr(PyObject *unicode)
6471{
6472 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6473 PyUnicode_GET_SIZE(unicode),
6474 1);
6475}
6476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006477PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478"S.rfind(sub [,start [,end]]) -> int\n\
6479\n\
6480Return the highest index in S where substring sub is found,\n\
6481such that sub is contained within s[start,end]. Optional\n\
6482arguments start and end are interpreted as in slice notation.\n\
6483\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
6486static PyObject *
6487unicode_rfind(PyUnicodeObject *self, PyObject *args)
6488{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006489 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006490 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006491 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
Guido van Rossumb8872e62000-05-09 14:14:27 +00006494 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6495 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 substring = PyUnicode_FromObject(substring);
6498 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return NULL;
6500
Thomas Wouters477c8d52006-05-27 19:21:47 +00006501 result = stringlib_rfind_slice(
6502 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6503 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6504 start, end
6505 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006508
6509 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510}
6511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006512PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513"S.rindex(sub [,start [,end]]) -> int\n\
6514\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
6517static PyObject *
6518unicode_rindex(PyUnicodeObject *self, PyObject *args)
6519{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006522 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Guido van Rossumb8872e62000-05-09 14:14:27 +00006525 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6526 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006528 substring = PyUnicode_FromObject(substring);
6529 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return NULL;
6531
Thomas Wouters477c8d52006-05-27 19:21:47 +00006532 result = stringlib_rfind_slice(
6533 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6534 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6535 start, end
6536 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
6538 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006539
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 if (result < 0) {
6541 PyErr_SetString(PyExc_ValueError, "substring not found");
6542 return NULL;
6543 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006544 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545}
6546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006547PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006548"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549\n\
6550Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006551done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553static PyObject *
6554unicode_rjust(PyUnicodeObject *self, PyObject *args)
6555{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006556 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006557 Py_UNICODE fillchar = ' ';
6558
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006559 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 return NULL;
6561
Tim Peters7a29bd52001-09-12 03:03:31 +00006562 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 Py_INCREF(self);
6564 return (PyObject*) self;
6565 }
6566
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006567 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568}
6569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572{
6573 /* standard clamping */
6574 if (start < 0)
6575 start = 0;
6576 if (end < 0)
6577 end = 0;
6578 if (end > self->length)
6579 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006580 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 /* full slice, return original string */
6582 Py_INCREF(self);
6583 return (PyObject*) self;
6584 }
6585 if (start > end)
6586 start = end;
6587 /* copy slice */
6588 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6589 end - start);
6590}
6591
6592PyObject *PyUnicode_Split(PyObject *s,
6593 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
6596 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 s = PyUnicode_FromObject(s);
6599 if (s == NULL)
6600 return NULL;
6601 if (sep != NULL) {
6602 sep = PyUnicode_FromObject(sep);
6603 if (sep == NULL) {
6604 Py_DECREF(s);
6605 return NULL;
6606 }
6607 }
6608
6609 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6610
6611 Py_DECREF(s);
6612 Py_XDECREF(sep);
6613 return result;
6614}
6615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617"S.split([sep [,maxsplit]]) -> list of strings\n\
6618\n\
6619Return a list of the words in S, using sep as the\n\
6620delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006621splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006622any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
6624static PyObject*
6625unicode_split(PyUnicodeObject *self, PyObject *args)
6626{
6627 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006628 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629
Martin v. Löwis18e16552006-02-15 17:27:45 +00006630 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 return NULL;
6632
6633 if (substring == Py_None)
6634 return split(self, NULL, maxcount);
6635 else if (PyUnicode_Check(substring))
6636 return split(self, (PyUnicodeObject *)substring, maxcount);
6637 else
6638 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6639}
6640
Thomas Wouters477c8d52006-05-27 19:21:47 +00006641PyObject *
6642PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6643{
6644 PyObject* str_obj;
6645 PyObject* sep_obj;
6646 PyObject* out;
6647
6648 str_obj = PyUnicode_FromObject(str_in);
6649 if (!str_obj)
6650 return NULL;
6651 sep_obj = PyUnicode_FromObject(sep_in);
6652 if (!sep_obj) {
6653 Py_DECREF(str_obj);
6654 return NULL;
6655 }
6656
6657 out = stringlib_partition(
6658 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6659 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6660 );
6661
6662 Py_DECREF(sep_obj);
6663 Py_DECREF(str_obj);
6664
6665 return out;
6666}
6667
6668
6669PyObject *
6670PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6671{
6672 PyObject* str_obj;
6673 PyObject* sep_obj;
6674 PyObject* out;
6675
6676 str_obj = PyUnicode_FromObject(str_in);
6677 if (!str_obj)
6678 return NULL;
6679 sep_obj = PyUnicode_FromObject(sep_in);
6680 if (!sep_obj) {
6681 Py_DECREF(str_obj);
6682 return NULL;
6683 }
6684
6685 out = stringlib_rpartition(
6686 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6687 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6688 );
6689
6690 Py_DECREF(sep_obj);
6691 Py_DECREF(str_obj);
6692
6693 return out;
6694}
6695
6696PyDoc_STRVAR(partition__doc__,
6697"S.partition(sep) -> (head, sep, tail)\n\
6698\n\
6699Searches for the separator sep in S, and returns the part before it,\n\
6700the separator itself, and the part after it. If the separator is not\n\
6701found, returns S and two empty strings.");
6702
6703static PyObject*
6704unicode_partition(PyUnicodeObject *self, PyObject *separator)
6705{
6706 return PyUnicode_Partition((PyObject *)self, separator);
6707}
6708
6709PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006710"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006711\n\
6712Searches for the separator sep in S, starting at the end of S, and returns\n\
6713the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006714separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006715
6716static PyObject*
6717unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6718{
6719 return PyUnicode_RPartition((PyObject *)self, separator);
6720}
6721
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006722PyObject *PyUnicode_RSplit(PyObject *s,
6723 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006725{
6726 PyObject *result;
6727
6728 s = PyUnicode_FromObject(s);
6729 if (s == NULL)
6730 return NULL;
6731 if (sep != NULL) {
6732 sep = PyUnicode_FromObject(sep);
6733 if (sep == NULL) {
6734 Py_DECREF(s);
6735 return NULL;
6736 }
6737 }
6738
6739 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6740
6741 Py_DECREF(s);
6742 Py_XDECREF(sep);
6743 return result;
6744}
6745
6746PyDoc_STRVAR(rsplit__doc__,
6747"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6748\n\
6749Return a list of the words in S, using sep as the\n\
6750delimiter string, starting at the end of the string and\n\
6751working to the front. If maxsplit is given, at most maxsplit\n\
6752splits are done. If sep is not specified, any whitespace string\n\
6753is a separator.");
6754
6755static PyObject*
6756unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6757{
6758 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006760
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006762 return NULL;
6763
6764 if (substring == Py_None)
6765 return rsplit(self, NULL, maxcount);
6766 else if (PyUnicode_Check(substring))
6767 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6768 else
6769 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6770}
6771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006772PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006773"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774\n\
6775Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006776Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
6779static PyObject*
6780unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6781{
Guido van Rossum86662912000-04-11 15:38:46 +00006782 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Guido van Rossum86662912000-04-11 15:38:46 +00006784 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 return NULL;
6786
Guido van Rossum86662912000-04-11 15:38:46 +00006787 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
6790static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006791PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006793 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6794 Py_XINCREF(res);
6795 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799"S.swapcase() -> unicode\n\
6800\n\
6801Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006805unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 return fixup(self, fixswapcase);
6808}
6809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811"S.translate(table) -> unicode\n\
6812\n\
6813Return a copy of the string S, where all characters have been mapped\n\
6814through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006815Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6816Unmapped characters are left untouched. Characters mapped to None\n\
6817are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
6819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006820unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Tim Petersced69f82003-09-16 20:30:58 +00006822 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006824 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 "ignore");
6826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829"S.upper() -> unicode\n\
6830\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
6833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006834unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 return fixup(self, fixupper);
6837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840"S.zfill(width) -> unicode\n\
6841\n\
6842Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject *
6846unicode_zfill(PyUnicodeObject *self, PyObject *args)
6847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006848 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 PyUnicodeObject *u;
6850
Martin v. Löwis18e16552006-02-15 17:27:45 +00006851 Py_ssize_t width;
6852 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return NULL;
6854
6855 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006856 if (PyUnicode_CheckExact(self)) {
6857 Py_INCREF(self);
6858 return (PyObject*) self;
6859 }
6860 else
6861 return PyUnicode_FromUnicode(
6862 PyUnicode_AS_UNICODE(self),
6863 PyUnicode_GET_SIZE(self)
6864 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 }
6866
6867 fill = width - self->length;
6868
6869 u = pad(self, fill, 0, '0');
6870
Walter Dörwald068325e2002-04-15 13:36:47 +00006871 if (u == NULL)
6872 return NULL;
6873
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 if (u->str[fill] == '+' || u->str[fill] == '-') {
6875 /* move sign to beginning of string */
6876 u->str[0] = u->str[fill];
6877 u->str[fill] = '0';
6878 }
6879
6880 return (PyObject*) u;
6881}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
#if 0
/* Compiled out: would report unicode_freelist_size as a Python integer. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006891PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006894Return True if S starts with the specified prefix, False otherwise.\n\
6895With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896With optional end, stop comparing S at that position.\n\
6897prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
6899static PyObject *
6900unicode_startswith(PyUnicodeObject *self,
6901 PyObject *args)
6902{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006905 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006906 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006910 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006912 if (PyTuple_Check(subobj)) {
6913 Py_ssize_t i;
6914 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6915 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6916 PyTuple_GET_ITEM(subobj, i));
6917 if (substring == NULL)
6918 return NULL;
6919 result = tailmatch(self, substring, start, end, -1);
6920 Py_DECREF(substring);
6921 if (result) {
6922 Py_RETURN_TRUE;
6923 }
6924 }
6925 /* nothing matched */
6926 Py_RETURN_FALSE;
6927 }
6928 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930 return NULL;
6931 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006933 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934}
6935
6936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006940Return True if S ends with the specified suffix, False otherwise.\n\
6941With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942With optional end, stop comparing S at that position.\n\
6943suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944
6945static PyObject *
6946unicode_endswith(PyUnicodeObject *self,
6947 PyObject *args)
6948{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006952 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6956 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 if (PyTuple_Check(subobj)) {
6959 Py_ssize_t i;
6960 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6961 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6962 PyTuple_GET_ITEM(subobj, i));
6963 if (substring == NULL)
6964 return NULL;
6965 result = tailmatch(self, substring, start, end, +1);
6966 Py_DECREF(substring);
6967 if (result) {
6968 Py_RETURN_TRUE;
6969 }
6970 }
6971 Py_RETURN_FALSE;
6972 }
6973 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980}
6981
6982
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006983
6984static PyObject *
6985unicode_getnewargs(PyUnicodeObject *v)
6986{
6987 return Py_BuildValue("(u#)", v->str, v->length);
6988}
6989
6990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991static PyMethodDef unicode_methods[] = {
6992
6993 /* Order is according to common usage: often used methods should
6994 appear first, since lookup is done sequentially. */
6995
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006996 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6997 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6998 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006999 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007000 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7001 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7002 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7003 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7004 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7005 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7006 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007008 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7009 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7010 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007011 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007012 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007013/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7014 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7015 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7016 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007019 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007021 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7022 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7023 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7024 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7025 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7026 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7027 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7028 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7029 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7030 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7031 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7032 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7033 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7034 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007035 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007036#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007037 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038#endif
7039
7040#if 0
7041 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043#endif
7044
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007045 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 {NULL, NULL}
7047};
7048
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007049static PyObject *
7050unicode_mod(PyObject *v, PyObject *w)
7051{
7052 if (!PyUnicode_Check(v)) {
7053 Py_INCREF(Py_NotImplemented);
7054 return Py_NotImplemented;
7055 }
7056 return PyUnicode_Format(v, w);
7057}
7058
7059static PyNumberMethods unicode_as_number = {
7060 0, /*nb_add*/
7061 0, /*nb_subtract*/
7062 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007063 unicode_mod, /*nb_remainder*/
7064};
7065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007067 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007068 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007069 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7070 (ssizeargfunc) unicode_getitem, /* sq_item */
7071 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 0, /* sq_ass_item */
7073 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007074 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075};
7076
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007077static PyObject*
7078unicode_subscript(PyUnicodeObject* self, PyObject* item)
7079{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007080 if (PyIndex_Check(item)) {
7081 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007082 if (i == -1 && PyErr_Occurred())
7083 return NULL;
7084 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007085 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007086 return unicode_getitem(self, i);
7087 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007089 Py_UNICODE* source_buf;
7090 Py_UNICODE* result_buf;
7091 PyObject* result;
7092
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007093 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007094 &start, &stop, &step, &slicelength) < 0) {
7095 return NULL;
7096 }
7097
7098 if (slicelength <= 0) {
7099 return PyUnicode_FromUnicode(NULL, 0);
7100 } else {
7101 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007102 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7103 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007104
7105 if (result_buf == NULL)
7106 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007107
7108 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7109 result_buf[i] = source_buf[cur];
7110 }
Tim Petersced69f82003-09-16 20:30:58 +00007111
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007112 result = PyUnicode_FromUnicode(result_buf, slicelength);
7113 PyMem_FREE(result_buf);
7114 return result;
7115 }
7116 } else {
7117 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7118 return NULL;
7119 }
7120}
7121
7122static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007123 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007124 (binaryfunc)unicode_subscript, /* mp_subscript */
7125 (objobjargproc)0, /* mp_ass_subscript */
7126};
7127
Martin v. Löwis18e16552006-02-15 17:27:45 +00007128static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007130 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 const void **ptr)
7132{
7133 if (index != 0) {
7134 PyErr_SetString(PyExc_SystemError,
7135 "accessing non-existent unicode segment");
7136 return -1;
7137 }
7138 *ptr = (void *) self->str;
7139 return PyUnicode_GET_DATA_SIZE(self);
7140}
7141
Martin v. Löwis18e16552006-02-15 17:27:45 +00007142static Py_ssize_t
7143unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 const void **ptr)
7145{
7146 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007147 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 return -1;
7149}
7150
7151static int
7152unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154{
7155 if (lenp)
7156 *lenp = PyUnicode_GET_DATA_SIZE(self);
7157 return 1;
7158}
7159
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007160static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007162 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 const void **ptr)
7164{
7165 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 if (index != 0) {
7168 PyErr_SetString(PyExc_SystemError,
7169 "accessing non-existent unicode segment");
7170 return -1;
7171 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007172 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (str == NULL)
7174 return -1;
7175 *ptr = (void *) PyString_AS_STRING(str);
7176 return PyString_GET_SIZE(str);
7177}
7178
7179/* Helpers for PyUnicode_Format() */
7180
7181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 if (argidx < arglen) {
7186 (*p_argidx)++;
7187 if (arglen < 0)
7188 return args;
7189 else
7190 return PyTuple_GetItem(args, argidx);
7191 }
7192 PyErr_SetString(PyExc_TypeError,
7193 "not enough arguments for format string");
7194 return NULL;
7195}
7196
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
7202
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007204strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 register Py_ssize_t i;
7207 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 for (i = len - 1; i >= 0; i--)
7209 buffer[i] = (Py_UNICODE) charbuffer[i];
7210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 return len;
7212}
7213
Neal Norwitzfc76d632006-01-10 06:03:13 +00007214static int
7215doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7216{
Tim Peters15231542006-02-16 01:08:01 +00007217 Py_ssize_t result;
7218
Neal Norwitzfc76d632006-01-10 06:03:13 +00007219 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007220 result = strtounicode(buffer, (char *)buffer);
7221 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007222}
7223
7224static int
7225longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7226{
Tim Peters15231542006-02-16 01:08:01 +00007227 Py_ssize_t result;
7228
Neal Norwitzfc76d632006-01-10 06:03:13 +00007229 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007230 result = strtounicode(buffer, (char *)buffer);
7231 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007232}
7233
Guido van Rossum078151d2002-08-11 04:24:12 +00007234/* XXX To save some code duplication, formatfloat/long/int could have been
7235 shared with stringobject.c, converting from 8-bit to Unicode after the
7236 formatting is done. */
7237
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238static int
7239formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007240 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 int flags,
7242 int prec,
7243 int type,
7244 PyObject *v)
7245{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007246 /* fmt = '%#.' + `prec` + `type`
7247 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 char fmt[20];
7249 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 x = PyFloat_AsDouble(v);
7252 if (x == -1.0 && PyErr_Occurred())
7253 return -1;
7254 if (prec < 0)
7255 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7257 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007258 /* Worst case length calc to ensure no buffer overrun:
7259
7260 'g' formats:
7261 fmt = %#.<prec>g
7262 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7263 for any double rep.)
7264 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7265
7266 'f' formats:
7267 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7268 len = 1 + 50 + 1 + prec = 52 + prec
7269
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007270 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007271 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007272
7273 */
7274 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7275 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007276 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007277 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007278 return -1;
7279 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007280 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7281 (flags&F_ALT) ? "#" : "",
7282 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007283 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284}
7285
Tim Peters38fd5b62000-09-21 05:43:11 +00007286static PyObject*
7287formatlong(PyObject *val, int flags, int prec, int type)
7288{
7289 char *buf;
7290 int i, len;
7291 PyObject *str; /* temporary string object. */
7292 PyUnicodeObject *result;
7293
7294 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7295 if (!str)
7296 return NULL;
7297 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007298 if (!result) {
7299 Py_DECREF(str);
7300 return NULL;
7301 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007302 for (i = 0; i < len; i++)
7303 result->str[i] = buf[i];
7304 result->str[len] = 0;
7305 Py_DECREF(str);
7306 return (PyObject*)result;
7307}
7308
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309static int
7310formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007311 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 int flags,
7313 int prec,
7314 int type,
7315 PyObject *v)
7316{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007317 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007318 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7319 * + 1 + 1
7320 * = 24
7321 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007322 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007323 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 long x;
7325
7326 x = PyInt_AsLong(v);
7327 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007328 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007329 if (x < 0 && type == 'u') {
7330 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007331 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007332 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7333 sign = "-";
7334 else
7335 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007337 prec = 1;
7338
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007339 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7340 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007341 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007342 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007343 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007344 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007345 return -1;
7346 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007347
7348 if ((flags & F_ALT) &&
7349 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007350 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007351 * of issues that cause pain:
7352 * - when 0 is being converted, the C standard leaves off
7353 * the '0x' or '0X', which is inconsistent with other
7354 * %#x/%#X conversions and inconsistent with Python's
7355 * hex() function
7356 * - there are platforms that violate the standard and
7357 * convert 0 with the '0x' or '0X'
7358 * (Metrowerks, Compaq Tru64)
7359 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007360 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007361 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007362 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007363 * We can achieve the desired consistency by inserting our
7364 * own '0x' or '0X' prefix, and substituting %x/%X in place
7365 * of %#x/%#X.
7366 *
7367 * Note that this is the same approach as used in
7368 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007369 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007370 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7371 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007372 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007373 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7375 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007376 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007377 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007378 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007379 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007380 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007381 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
7384static int
7385formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007386 size_t buflen,
7387 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007389 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007390 if (PyUnicode_Check(v)) {
7391 if (PyUnicode_GET_SIZE(v) != 1)
7392 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007396 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007397 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007398 goto onError;
7399 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
7402 else {
7403 /* Integer input truncated to a character */
7404 long x;
7405 x = PyInt_AsLong(v);
7406 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007407 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007408#ifdef Py_UNICODE_WIDE
7409 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007410 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007411 "%c arg not in range(0x110000) "
7412 "(wide Python build)");
7413 return -1;
7414 }
7415#else
7416 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007417 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007418 "%c arg not in range(0x10000) "
7419 "(narrow Python build)");
7420 return -1;
7421 }
7422#endif
7423 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 }
7425 buf[1] = '\0';
7426 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007427
7428 onError:
7429 PyErr_SetString(PyExc_TypeError,
7430 "%c requires int or char");
7431 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432}
7433
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007434/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7435
7436 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7437 chars are formatted. XXX This is a magic number. Each formatting
7438 routine does bounds checking to ensure no overflow, but a better
7439 solution may be to malloc a buffer of appropriate size for each
7440 format. For now, the current solution is sufficient.
7441*/
#define FORMATBUFLEN ((size_t)120)
7443
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444PyObject *PyUnicode_Format(PyObject *format,
7445 PyObject *args)
7446{
7447 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 int args_owned = 0;
7450 PyUnicodeObject *result = NULL;
7451 PyObject *dict = NULL;
7452 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 if (format == NULL || args == NULL) {
7455 PyErr_BadInternalCall();
7456 return NULL;
7457 }
7458 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007459 if (uformat == NULL)
7460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 fmt = PyUnicode_AS_UNICODE(uformat);
7462 fmtcnt = PyUnicode_GET_SIZE(uformat);
7463
7464 reslen = rescnt = fmtcnt + 100;
7465 result = _PyUnicode_New(reslen);
7466 if (result == NULL)
7467 goto onError;
7468 res = PyUnicode_AS_UNICODE(result);
7469
7470 if (PyTuple_Check(args)) {
7471 arglen = PyTuple_Size(args);
7472 argidx = 0;
7473 }
7474 else {
7475 arglen = -1;
7476 argidx = -2;
7477 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007478 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7479 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 dict = args;
7481
7482 while (--fmtcnt >= 0) {
7483 if (*fmt != '%') {
7484 if (--rescnt < 0) {
7485 rescnt = fmtcnt + 100;
7486 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007487 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7490 --rescnt;
7491 }
7492 *res++ = *fmt++;
7493 }
7494 else {
7495 /* Got a format specifier */
7496 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 Py_UNICODE c = '\0';
7500 Py_UNICODE fill;
7501 PyObject *v = NULL;
7502 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007503 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007506 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
7508 fmt++;
7509 if (*fmt == '(') {
7510 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007511 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 PyObject *key;
7513 int pcount = 1;
7514
7515 if (dict == NULL) {
7516 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007517 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 goto onError;
7519 }
7520 ++fmt;
7521 --fmtcnt;
7522 keystart = fmt;
7523 /* Skip over balanced parentheses */
7524 while (pcount > 0 && --fmtcnt >= 0) {
7525 if (*fmt == ')')
7526 --pcount;
7527 else if (*fmt == '(')
7528 ++pcount;
7529 fmt++;
7530 }
7531 keylen = fmt - keystart - 1;
7532 if (fmtcnt < 0 || pcount > 0) {
7533 PyErr_SetString(PyExc_ValueError,
7534 "incomplete format key");
7535 goto onError;
7536 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007537#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007538 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 then looked up since Python uses strings to hold
7540 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007541 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 key = PyUnicode_EncodeUTF8(keystart,
7543 keylen,
7544 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007545#else
7546 key = PyUnicode_FromUnicode(keystart, keylen);
7547#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 if (key == NULL)
7549 goto onError;
7550 if (args_owned) {
7551 Py_DECREF(args);
7552 args_owned = 0;
7553 }
7554 args = PyObject_GetItem(dict, key);
7555 Py_DECREF(key);
7556 if (args == NULL) {
7557 goto onError;
7558 }
7559 args_owned = 1;
7560 arglen = -1;
7561 argidx = -2;
7562 }
7563 while (--fmtcnt >= 0) {
7564 switch (c = *fmt++) {
7565 case '-': flags |= F_LJUST; continue;
7566 case '+': flags |= F_SIGN; continue;
7567 case ' ': flags |= F_BLANK; continue;
7568 case '#': flags |= F_ALT; continue;
7569 case '0': flags |= F_ZERO; continue;
7570 }
7571 break;
7572 }
7573 if (c == '*') {
7574 v = getnextarg(args, arglen, &argidx);
7575 if (v == NULL)
7576 goto onError;
7577 if (!PyInt_Check(v)) {
7578 PyErr_SetString(PyExc_TypeError,
7579 "* wants int");
7580 goto onError;
7581 }
7582 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007583 if (width == -1 && PyErr_Occurred())
7584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 if (width < 0) {
7586 flags |= F_LJUST;
7587 width = -width;
7588 }
7589 if (--fmtcnt >= 0)
7590 c = *fmt++;
7591 }
7592 else if (c >= '0' && c <= '9') {
7593 width = c - '0';
7594 while (--fmtcnt >= 0) {
7595 c = *fmt++;
7596 if (c < '0' || c > '9')
7597 break;
7598 if ((width*10) / 10 != width) {
7599 PyErr_SetString(PyExc_ValueError,
7600 "width too big");
7601 goto onError;
7602 }
7603 width = width*10 + (c - '0');
7604 }
7605 }
7606 if (c == '.') {
7607 prec = 0;
7608 if (--fmtcnt >= 0)
7609 c = *fmt++;
7610 if (c == '*') {
7611 v = getnextarg(args, arglen, &argidx);
7612 if (v == NULL)
7613 goto onError;
7614 if (!PyInt_Check(v)) {
7615 PyErr_SetString(PyExc_TypeError,
7616 "* wants int");
7617 goto onError;
7618 }
7619 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007620 if (prec == -1 && PyErr_Occurred())
7621 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 if (prec < 0)
7623 prec = 0;
7624 if (--fmtcnt >= 0)
7625 c = *fmt++;
7626 }
7627 else if (c >= '0' && c <= '9') {
7628 prec = c - '0';
7629 while (--fmtcnt >= 0) {
7630 c = Py_CHARMASK(*fmt++);
7631 if (c < '0' || c > '9')
7632 break;
7633 if ((prec*10) / 10 != prec) {
7634 PyErr_SetString(PyExc_ValueError,
7635 "prec too big");
7636 goto onError;
7637 }
7638 prec = prec*10 + (c - '0');
7639 }
7640 }
7641 } /* prec */
7642 if (fmtcnt >= 0) {
7643 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 if (--fmtcnt >= 0)
7645 c = *fmt++;
7646 }
7647 }
7648 if (fmtcnt < 0) {
7649 PyErr_SetString(PyExc_ValueError,
7650 "incomplete format");
7651 goto onError;
7652 }
7653 if (c != '%') {
7654 v = getnextarg(args, arglen, &argidx);
7655 if (v == NULL)
7656 goto onError;
7657 }
7658 sign = 0;
7659 fill = ' ';
7660 switch (c) {
7661
7662 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007663 pbuf = formatbuf;
7664 /* presume that buffer length is at least 1 */
7665 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 len = 1;
7667 break;
7668
7669 case 's':
7670 case 'r':
7671 if (PyUnicode_Check(v) && c == 's') {
7672 temp = v;
7673 Py_INCREF(temp);
7674 }
7675 else {
7676 PyObject *unicode;
7677 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007678 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 else
7680 temp = PyObject_Repr(v);
7681 if (temp == NULL)
7682 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007683 if (PyUnicode_Check(temp))
7684 /* nothing to do */;
7685 else if (PyString_Check(temp)) {
7686 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007687 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007689 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007691 Py_DECREF(temp);
7692 temp = unicode;
7693 if (temp == NULL)
7694 goto onError;
7695 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007696 else {
7697 Py_DECREF(temp);
7698 PyErr_SetString(PyExc_TypeError,
7699 "%s argument has non-string str()");
7700 goto onError;
7701 }
7702 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007703 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 len = PyUnicode_GET_SIZE(temp);
7705 if (prec >= 0 && len > prec)
7706 len = prec;
7707 break;
7708
7709 case 'i':
7710 case 'd':
7711 case 'u':
7712 case 'o':
7713 case 'x':
7714 case 'X':
7715 if (c == 'i')
7716 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007717 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007718 temp = formatlong(v, flags, prec, c);
7719 if (!temp)
7720 goto onError;
7721 pbuf = PyUnicode_AS_UNICODE(temp);
7722 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007723 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007725 else {
7726 pbuf = formatbuf;
7727 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7728 flags, prec, c, v);
7729 if (len < 0)
7730 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007731 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007732 }
7733 if (flags & F_ZERO)
7734 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 break;
7736
7737 case 'e':
7738 case 'E':
7739 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007740 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 case 'g':
7742 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007743 if (c == 'F')
7744 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007745 pbuf = formatbuf;
7746 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7747 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 if (len < 0)
7749 goto onError;
7750 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007751 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 fill = '0';
7753 break;
7754
7755 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007756 pbuf = formatbuf;
7757 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 if (len < 0)
7759 goto onError;
7760 break;
7761
7762 default:
7763 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007764 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007765 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007766 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007767 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007768 (Py_ssize_t)(fmt - 1 -
7769 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 goto onError;
7771 }
7772 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007773 if (*pbuf == '-' || *pbuf == '+') {
7774 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 len--;
7776 }
7777 else if (flags & F_SIGN)
7778 sign = '+';
7779 else if (flags & F_BLANK)
7780 sign = ' ';
7781 else
7782 sign = 0;
7783 }
7784 if (width < len)
7785 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007786 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 reslen -= rescnt;
7788 rescnt = width + fmtcnt + 100;
7789 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007790 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007791 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007792 PyErr_NoMemory();
7793 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007794 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007795 if (_PyUnicode_Resize(&result, reslen) < 0) {
7796 Py_XDECREF(temp);
7797 goto onError;
7798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 res = PyUnicode_AS_UNICODE(result)
7800 + reslen - rescnt;
7801 }
7802 if (sign) {
7803 if (fill != ' ')
7804 *res++ = sign;
7805 rescnt--;
7806 if (width > len)
7807 width--;
7808 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007809 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7810 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007811 assert(pbuf[1] == c);
7812 if (fill != ' ') {
7813 *res++ = *pbuf++;
7814 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007815 }
Tim Petersfff53252001-04-12 18:38:48 +00007816 rescnt -= 2;
7817 width -= 2;
7818 if (width < 0)
7819 width = 0;
7820 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 if (width > len && !(flags & F_LJUST)) {
7823 do {
7824 --rescnt;
7825 *res++ = fill;
7826 } while (--width > len);
7827 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007828 if (fill == ' ') {
7829 if (sign)
7830 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007831 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007832 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007833 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007834 *res++ = *pbuf++;
7835 *res++ = *pbuf++;
7836 }
7837 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007838 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 res += len;
7840 rescnt -= len;
7841 while (--width >= len) {
7842 --rescnt;
7843 *res++ = ' ';
7844 }
7845 if (dict && (argidx < arglen) && c != '%') {
7846 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007847 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007848 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 goto onError;
7850 }
7851 Py_XDECREF(temp);
7852 } /* '%' */
7853 } /* until end */
7854 if (argidx < arglen && !dict) {
7855 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007856 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 goto onError;
7858 }
7859
Thomas Woutersa96affe2006-03-12 00:29:36 +00007860 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 if (args_owned) {
7863 Py_DECREF(args);
7864 }
7865 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 return (PyObject *)result;
7867
7868 onError:
7869 Py_XDECREF(result);
7870 Py_DECREF(uformat);
7871 if (args_owned) {
7872 Py_DECREF(args);
7873 }
7874 return NULL;
7875}
7876
/* Buffer-procedure table for unicode objects.  It is referenced from the
   tp_as_buffer slot of PyUnicode_Type; each entry is one of the
   unicode_buffer_* functions cast to the slot's function-pointer type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* bf_getreadbuffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* bf_getwritebuffer */
    (segcountproc) unicode_buffer_getsegcount,      /* bf_getsegcount */
    (charbufferproc) unicode_buffer_getcharbuf,     /* bf_getcharbuffer */
};
7883
Jeremy Hylton938ace62002-07-17 16:30:39 +00007884static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007885unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7886
Tim Peters6d6c1a32001-08-02 04:15:00 +00007887static PyObject *
7888unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7889{
7890 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007891 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007892 char *encoding = NULL;
7893 char *errors = NULL;
7894
Guido van Rossume023fe02001-08-30 03:12:59 +00007895 if (type != &PyUnicode_Type)
7896 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007897 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7898 kwlist, &x, &encoding, &errors))
7899 return NULL;
7900 if (x == NULL)
7901 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007902 if (encoding == NULL && errors == NULL)
7903 return PyObject_Unicode(x);
7904 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007905 return PyUnicode_FromEncodedObject(x, encoding, errors);
7906}
7907
Guido van Rossume023fe02001-08-30 03:12:59 +00007908static PyObject *
7909unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7910{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007911 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007912 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007913
7914 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7915 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7916 if (tmp == NULL)
7917 return NULL;
7918 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007919 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007920 if (pnew == NULL) {
7921 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007922 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007923 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007924 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7925 if (pnew->str == NULL) {
7926 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007927 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007928 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007929 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007930 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007931 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7932 pnew->length = n;
7933 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007934 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007935 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007936}
7937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007939"unicode(string [, encoding[, errors]]) -> object\n\
7940\n\
7941Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007942encoding defaults to the current default string encoding.\n\
7943errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007944
/* Forward declaration: needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for unicode strings.  Its tp_name is "str", it may be
   subclassed (Py_TPFLAGS_BASETYPE), and it derives from
   PyBaseString_Type.  Slots initialized to 0 are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7991
7992/* Initialize the Unicode implementation */
7993
Thomas Wouters78890102000-07-22 19:25:51 +00007994void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007996 int i;
7997
Thomas Wouters477c8d52006-05-27 19:21:47 +00007998 /* XXX - move this array to unicodectype.c ? */
7999 Py_UNICODE linebreak[] = {
8000 0x000A, /* LINE FEED */
8001 0x000D, /* CARRIAGE RETURN */
8002 0x001C, /* FILE SEPARATOR */
8003 0x001D, /* GROUP SEPARATOR */
8004 0x001E, /* RECORD SEPARATOR */
8005 0x0085, /* NEXT LINE */
8006 0x2028, /* LINE SEPARATOR */
8007 0x2029, /* PARAGRAPH SEPARATOR */
8008 };
8009
Fred Drakee4315f52000-05-09 19:53:39 +00008010 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008011 unicode_freelist = NULL;
8012 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008014 if (!unicode_empty)
8015 return;
8016
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008017 for (i = 0; i < 256; i++)
8018 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008019 if (PyType_Ready(&PyUnicode_Type) < 0)
8020 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008021
8022 /* initialize the linebreak bloom filter */
8023 bloom_linebreak = make_bloom_mask(
8024 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8025 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008026
8027 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028}
8029
8030/* Finalize the Unicode implementation */
8031
8032void
Thomas Wouters78890102000-07-22 19:25:51 +00008033_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008035 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008036 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008038 Py_XDECREF(unicode_empty);
8039 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008040
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008041 for (i = 0; i < 256; i++) {
8042 if (unicode_latin1[i]) {
8043 Py_DECREF(unicode_latin1[i]);
8044 unicode_latin1[i] = NULL;
8045 }
8046 }
8047
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008048 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 PyUnicodeObject *v = u;
8050 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008051 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008052 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008053 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008054 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008056 unicode_freelist = NULL;
8057 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008059
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008060
8061
8062/********************* Unicode Iterator **************************/
8063
/* Iterator state for walking a unicode object one element at a time. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next element to return */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
8069
8070static void
8071unicodeiter_dealloc(unicodeiterobject *it)
8072{
8073 _PyObject_GC_UNTRACK(it);
8074 Py_XDECREF(it->it_seq);
8075 PyObject_GC_Del(it);
8076}
8077
8078static int
8079unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8080{
8081 Py_VISIT(it->it_seq);
8082 return 0;
8083}
8084
8085static PyObject *
8086unicodeiter_next(unicodeiterobject *it)
8087{
8088 PyUnicodeObject *seq;
8089 PyObject *item;
8090
8091 assert(it != NULL);
8092 seq = it->it_seq;
8093 if (seq == NULL)
8094 return NULL;
8095 assert(PyUnicode_Check(seq));
8096
8097 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008098 item = PyUnicode_FromUnicode(
8099 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008100 if (item != NULL)
8101 ++it->it_index;
8102 return item;
8103 }
8104
8105 Py_DECREF(seq);
8106 it->it_seq = NULL;
8107 return NULL;
8108}
8109
8110static PyObject *
8111unicodeiter_len(unicodeiterobject *it)
8112{
8113 Py_ssize_t len = 0;
8114 if (it->it_seq)
8115 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8116 return PyInt_FromSsize_t(len);
8117}
8118
/* Docstring for the __length_hint__ method registered below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator type: only __length_hint__, which
   takes no arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL, NULL} /* sentinel */
};
8126
/* Type object for the iterator returned by unicode_iter().  Instances are
   garbage-collector aware (Py_TPFLAGS_HAVE_GC), return themselves from
   tp_iter and produce items through unicodeiter_next(). */
PyTypeObject PyUnicodeIter_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                      /* ob_size */
    "unicodeiterator",                      /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
8160
8161static PyObject *
8162unicode_iter(PyObject *seq)
8163{
8164 unicodeiterobject *it;
8165
8166 if (!PyUnicode_Check(seq)) {
8167 PyErr_BadInternalCall();
8168 return NULL;
8169 }
8170 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8171 if (it == NULL)
8172 return NULL;
8173 it->it_index = 0;
8174 Py_INCREF(seq);
8175 it->it_seq = (PyUnicodeObject *)seq;
8176 _PyObject_GC_TRACK(it);
8177 return (PyObject *)it;
8178}
8179
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008180#ifdef __cplusplus
8181}
8182#endif
8183
8184
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008185/*
8186Local variables:
8187c-basic-offset: 4
8188indent-tabs-mode: nil
8189End:
8190*/