blob: 70890736ae7ab01ae0e376116423b26d5bad12e3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* USE_INLINE is force-disabled here.  XXX - set via configure? */
#undef USE_INLINE

/* LOCAL(type) introduces a file-local helper function returning `type`.
   The MSVC variant is taken from _sre.c. */
#if defined(_MSC_VER)
#pragma warning(disable: 4710)
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
60
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "keep alive" optimization.

   Objects put on the free list keep their allocated character buffer
   as long as their length is below this limit.  This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000083
/* Endianness switches: big endian only when WORDS_BIGENDIAN is defined,
   little endian by default */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
91
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092/* --- Globals ------------------------------------------------------------
93
94 The globals are initialized by the _PyUnicode_Init() API and should
95 not be used before calling that API.
96
97*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Anthony Baxterac6bd462006-04-13 02:06:09 +000099
100#ifdef __cplusplus
101extern "C" {
102#endif
103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105static PyUnicodeObject *unicode_freelist;
106static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
109static PyUnicodeObject *unicode_empty;
110
111/* Single character Unicode strings in the Latin-1 range are being
112 shared as well. */
113static PyUnicodeObject *unicode_latin1[256];
114
Fred Drakee4315f52000-05-09 19:53:39 +0000115/* Default encoding to use and assume when NULL is passed as encoding
116 parameter; it is initialized by _PyUnicode_Init().
117
118 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000119 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000120
121*/
Fred Drakee4315f52000-05-09 19:53:39 +0000122static char unicode_default_encoding[100];
123
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000124Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000125PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000126{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000127#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128 return 0x10FFFF;
129#else
130 /* This is actually an illegal character, so it should
131 not be passed to unichr. */
132 return 0xFFFF;
133#endif
134}
135
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test bit (ch & 0x1F) of mask; non-zero means "ch may be in the set".
   The shifted constant is an unsigned long so that selecting bit 31
   neither overflows a signed int nor sign-extends when widened to
   BLOOM_MASK.  Both arguments are parenthesized so that arbitrary
   expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Linebreak test: cheap bloom pre-filter first, exact check second */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
152
153LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
154{
155 /* calculate simple bloom-style bitmask for a given unicode string */
156
157 long mask;
158 Py_ssize_t i;
159
160 mask = 0;
161 for (i = 0; i < len; i++)
162 mask |= (1 << (ptr[i] & 0x1F));
163
164 return mask;
165}
166
167LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
168{
169 Py_ssize_t i;
170
171 for (i = 0; i < setlen; i++)
172 if (set[i] == chr)
173 return 1;
174
Fredrik Lundh77633512006-05-23 19:47:35 +0000175 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000176}
177
/* True when chr passes the bloom pre-filter in mask and is found in set
   by unicode_member().  The expansion is fully parenthesized so the
   macro behaves as a single operand, e.g. under a leading `!`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
180
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181/* --- Unicode Object ----------------------------------------------------- */
182
183static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000188
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000189 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000191 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 /* Resizing shared object (unicode_empty or single character
194 objects) in-place is not allowed. Use PyUnicode_Resize()
195 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
205 /* We allocate one more byte to make sure the string is
206 Ux0000 terminated -- XXX is this needed ? */
207 oldstr = unicode->str;
208 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
209 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000210 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 PyErr_NoMemory();
212 return -1;
213 }
214 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000215 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000217 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000219 if (unicode->defenc) {
220 Py_DECREF(unicode->defenc);
221 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 }
223 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000224
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 return 0;
226}
227
228/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000229 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230
231 XXX This allocator could further be enhanced by assuring that the
232 free list never reduces its size below 1.
233
234*/
235
236static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 register PyUnicodeObject *unicode;
240
Tim Petersced69f82003-09-16 20:30:58 +0000241 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (length == 0 && unicode_empty != NULL) {
243 Py_INCREF(unicode_empty);
244 return unicode_empty;
245 }
246
247 /* Unicode freelist & memory allocation */
248 if (unicode_freelist) {
249 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000250 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization: we only upsize the buffer,
254 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000255 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000256 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000257 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000261 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 }
264 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000267 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 if (unicode == NULL)
269 return NULL;
270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
271 }
272
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000273 if (!unicode->str) {
274 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000275 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000276 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000277 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000278 * the caller fails before initializing str -- unicode_resize()
279 * reads str[0], and the Keep-Alive optimization can keep memory
280 * allocated for str alive across a call to unicode_dealloc(unicode).
281 * We don't want unicode_resize to read uninitialized memory in
282 * that case.
283 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000290
291 onError:
292 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000293 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295}
296
297static
Guido van Rossum9475a232001-10-05 20:51:39 +0000298void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000300 if (PyUnicode_CheckExact(unicode) &&
301 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 /* Keep-Alive optimization */
303 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000304 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 unicode->str = NULL;
306 unicode->length = 0;
307 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000308 if (unicode->defenc) {
309 Py_DECREF(unicode->defenc);
310 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000311 }
312 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 *(PyUnicodeObject **)unicode = unicode_freelist;
314 unicode_freelist = unicode;
315 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000318 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000319 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000320 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322}
323
Martin v. Löwis18e16552006-02-15 17:27:45 +0000324int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325{
326 register PyUnicodeObject *v;
327
328 /* Argument checks */
329 if (unicode == NULL) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000334 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000335 PyErr_BadInternalCall();
336 return -1;
337 }
338
339 /* Resizing unicode_empty and single character objects is not
340 possible since these are being shared. We simply return a fresh
341 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000342 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 (v == unicode_empty || v->length == 1)) {
344 PyUnicodeObject *w = _PyUnicode_New(length);
345 if (w == NULL)
346 return -1;
347 Py_UNICODE_COPY(w->str, v->str,
348 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000349 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 *unicode = (PyObject *)w;
351 return 0;
352 }
353
354 /* Note that we don't have to modify *unicode for unshared Unicode
355 objects, since we can modify them in-place. */
356 return unicode_resize(v, length);
357}
358
359/* Internal API for use in unicodeobject.c only ! */
360#define _PyUnicode_Resize(unicodevar, length) \
361 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
366 PyUnicodeObject *unicode;
367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000368 /* If the Unicode data is known at construction time, we can apply
369 some optimizations which share commonly used objects. */
370 if (u != NULL) {
371
372 /* Optimization for empty strings */
373 if (size == 0 && unicode_empty != NULL) {
374 Py_INCREF(unicode_empty);
375 return (PyObject *)unicode_empty;
376 }
377
378 /* Single character Unicode objects in the Latin-1 range are
379 shared when using this constructor */
380 if (size == 1 && *u < 256) {
381 unicode = unicode_latin1[*u];
382 if (!unicode) {
383 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000384 if (!unicode)
385 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000386 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000387 unicode_latin1[*u] = unicode;
388 }
389 Py_INCREF(unicode);
390 return (PyObject *)unicode;
391 }
392 }
Tim Petersced69f82003-09-16 20:30:58 +0000393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode = _PyUnicode_New(size);
395 if (!unicode)
396 return NULL;
397
398 /* Copy the Unicode data into the new object */
399 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401
402 return (PyObject *)unicode;
403}
404
405#ifdef HAVE_WCHAR_H
406
407PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409{
410 PyUnicodeObject *unicode;
411
412 if (w == NULL) {
413 PyErr_BadInternalCall();
414 return NULL;
415 }
416
417 unicode = _PyUnicode_New(size);
418 if (!unicode)
419 return NULL;
420
421 /* Copy the wchar_t data into the new object */
422#ifdef HAVE_USABLE_WCHAR_T
423 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000424#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 {
426 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000427 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000429 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 *u++ = *w++;
431 }
432#endif
433
434 return (PyObject *)unicode;
435}
436
Martin v. Löwis18e16552006-02-15 17:27:45 +0000437Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
438 wchar_t *w,
439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 if (unicode == NULL) {
442 PyErr_BadInternalCall();
443 return -1;
444 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000445
446 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000448 size = PyUnicode_GET_SIZE(unicode) + 1;
449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450#ifdef HAVE_USABLE_WCHAR_T
451 memcpy(w, unicode->str, size * sizeof(wchar_t));
452#else
453 {
454 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000455 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000457 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 *w++ = *u++;
459 }
460#endif
461
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000462 if (size > PyUnicode_GET_SIZE(unicode))
463 return PyUnicode_GET_SIZE(unicode);
464 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 return size;
466}
467
468#endif
469
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000470PyObject *PyUnicode_FromOrdinal(int ordinal)
471{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000472 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000473
474#ifdef Py_UNICODE_WIDE
475 if (ordinal < 0 || ordinal > 0x10ffff) {
476 PyErr_SetString(PyExc_ValueError,
477 "unichr() arg not in range(0x110000) "
478 "(wide Python build)");
479 return NULL;
480 }
481#else
482 if (ordinal < 0 || ordinal > 0xffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x10000) "
485 "(narrow Python build)");
486 return NULL;
487 }
488#endif
489
Hye-Shik Chang40574832004-04-06 07:24:51 +0000490 s[0] = (Py_UNICODE)ordinal;
491 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000492}
493
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494PyObject *PyUnicode_FromObject(register PyObject *obj)
495{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 /* XXX Perhaps we should make this API an alias of
497 PyObject_Unicode() instead ?! */
498 if (PyUnicode_CheckExact(obj)) {
499 Py_INCREF(obj);
500 return obj;
501 }
502 if (PyUnicode_Check(obj)) {
503 /* For a Unicode subtype that's not a Unicode object,
504 return a true Unicode object with the same data. */
505 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
506 PyUnicode_GET_SIZE(obj));
507 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
509}
510
511PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
512 const char *encoding,
513 const char *errors)
514{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000515 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000518
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 if (obj == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000524#if 0
525 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000526 that no encodings is given and then redirect to
527 PyObject_Unicode() which then applies the additional logic for
528 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000530 NOTE: This API should really only be used for object which
531 represent *encoded* Unicode !
532
533 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 if (PyUnicode_Check(obj)) {
535 if (encoding) {
536 PyErr_SetString(PyExc_TypeError,
537 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000539 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000540 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542#else
543 if (PyUnicode_Check(obj)) {
544 PyErr_SetString(PyExc_TypeError,
545 "decoding Unicode is not supported");
546 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000548#endif
549
550 /* Coerce object */
551 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552 s = PyString_AS_STRING(obj);
553 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
556 /* Overwrite the error message with something more useful in
557 case of a TypeError. */
558 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560 "coercing to Unicode: need string or buffer, "
561 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000562 obj->ob_type->tp_name);
563 goto onError;
564 }
Tim Petersced69f82003-09-16 20:30:58 +0000565
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000566 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 if (len == 0) {
568 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 }
Tim Petersced69f82003-09-16 20:30:58 +0000571 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000573
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000574 return v;
575
576 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578}
579
580PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 const char *encoding,
583 const char *errors)
584{
585 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000586
587 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000588 encoding = PyUnicode_GetDefaultEncoding();
589
590 /* Shortcuts for common default encodings */
591 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_DecodeMBCS(s, size, errors);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601
602 /* Decode via the codec registry */
603 buffer = PyBuffer_FromMemory((void *)s, size);
604 if (buffer == NULL)
605 goto onError;
606 unicode = PyCodec_Decode(buffer, encoding, errors);
607 if (unicode == NULL)
608 goto onError;
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 unicode->ob_type->tp_name);
613 Py_DECREF(unicode);
614 goto onError;
615 }
616 Py_DECREF(buffer);
617 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000618
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 onError:
620 Py_XDECREF(buffer);
621 return NULL;
622}
623
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000624PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
625 const char *encoding,
626 const char *errors)
627{
628 PyObject *v;
629
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634
635 if (encoding == NULL)
636 encoding = PyUnicode_GetDefaultEncoding();
637
638 /* Decode via the codec registry */
639 v = PyCodec_Decode(unicode, encoding, errors);
640 if (v == NULL)
641 goto onError;
642 return v;
643
644 onError:
645 return NULL;
646}
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 const char *encoding,
651 const char *errors)
652{
653 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 unicode = PyUnicode_FromUnicode(s, size);
656 if (unicode == NULL)
657 return NULL;
658 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
659 Py_DECREF(unicode);
660 return v;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Encode via the codec registry */
678 v = PyCodec_Encode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
688 const char *encoding,
689 const char *errors)
690{
691 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 if (!PyUnicode_Check(unicode)) {
694 PyErr_BadArgument();
695 goto onError;
696 }
Fred Drakee4315f52000-05-09 19:53:39 +0000697
Tim Petersced69f82003-09-16 20:30:58 +0000698 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000699 encoding = PyUnicode_GetDefaultEncoding();
700
701 /* Shortcuts for common default encodings */
702 if (errors == NULL) {
703 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000704 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000705 else if (strcmp(encoding, "latin-1") == 0)
706 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
708 else if (strcmp(encoding, "mbcs") == 0)
709 return PyUnicode_AsMBCSString(unicode);
710#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000711 else if (strcmp(encoding, "ascii") == 0)
712 return PyUnicode_AsASCIIString(unicode);
713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 /* Encode via the codec registry */
716 v = PyCodec_Encode(unicode, encoding, errors);
717 if (v == NULL)
718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 if (!PyString_Check(v)) {
720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000721 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 v->ob_type->tp_name);
723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
736
737 if (v)
738 return v;
739 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
740 if (v && errors == NULL)
741 ((PyUnicodeObject *)unicode)->defenc = v;
742 return v;
743}
744
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
746{
747 if (!PyUnicode_Check(unicode)) {
748 PyErr_BadArgument();
749 goto onError;
750 }
751 return PyUnicode_AS_UNICODE(unicode);
752
753 onError:
754 return NULL;
755}
756
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
759 if (!PyUnicode_Check(unicode)) {
760 PyErr_BadArgument();
761 goto onError;
762 }
763 return PyUnicode_GET_SIZE(unicode);
764
765 onError:
766 return -1;
767}
768
Thomas Wouters78890102000-07-22 19:25:51 +0000769const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000770{
771 return unicode_default_encoding;
772}
773
774int PyUnicode_SetDefaultEncoding(const char *encoding)
775{
776 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000777
Fred Drakee4315f52000-05-09 19:53:39 +0000778 /* Make sure the encoding is valid. As side effect, this also
779 loads the encoding into the codec registry cache. */
780 v = _PyCodec_Lookup(encoding);
781 if (v == NULL)
782 goto onError;
783 Py_DECREF(v);
784 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000785 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000786 sizeof(unicode_default_encoding));
787 return 0;
788
789 onError:
790 return -1;
791}
792
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000793/* error handling callback helper:
794 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000795 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796 and adjust various state variables.
797 return 0 on success, -1 on error
798*/
799
800static
801int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
802 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000803 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
804 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
808 PyObject *restuple = NULL;
809 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
811 Py_ssize_t requiredsize;
812 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 int res = -1;
816
817 if (*errorHandler == NULL) {
818 *errorHandler = PyCodec_LookupError(errors);
819 if (*errorHandler == NULL)
820 goto onError;
821 }
822
823 if (*exceptionObject == NULL) {
824 *exceptionObject = PyUnicodeDecodeError_Create(
825 encoding, input, insize, *startinpos, *endinpos, reason);
826 if (*exceptionObject == NULL)
827 goto onError;
828 }
829 else {
830 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
831 goto onError;
832 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
833 goto onError;
834 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
835 goto onError;
836 }
837
838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
839 if (restuple == NULL)
840 goto onError;
841 if (!PyTuple_Check(restuple)) {
842 PyErr_Format(PyExc_TypeError, &argparse[4]);
843 goto onError;
844 }
845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
846 goto onError;
847 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000848 newpos = insize+newpos;
849 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000851 goto onError;
852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
854 /* need more space? (at least enough for what we
855 have+the replacement+the rest of the string (starting
856 at the new input position), so we won't have to check space
857 when there are no errors in the rest of the string) */
858 repptr = PyUnicode_AS_UNICODE(repunicode);
859 repsize = PyUnicode_GET_SIZE(repunicode);
860 requiredsize = *outpos + repsize + insize-newpos;
861 if (requiredsize > outsize) {
862 if (requiredsize<2*outsize)
863 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000864 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 goto onError;
866 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
867 }
868 *endinpos = newpos;
869 *inptr = input + newpos;
870 Py_UNICODE_COPY(*outptr, repptr, repsize);
871 *outptr += repsize;
872 *outpos += repsize;
873 /* we made it! */
874 res = 0;
875
876 onError:
877 Py_XDECREF(restuple);
878 return res;
879}
880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881/* --- UTF-7 Codec -------------------------------------------------------- */
882
883/* see RFC2152 for details */
884
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   says whether a character is special, i.e. cannot be directly encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   One row per 16 code points; the comment gives the row's first index. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
903
/* Helper macros for the UTF-7 codec.  All of them evaluate their
   arguments more than once, so pass plain variables only. */

/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly and must go through a base64 shift sequence.  The optional
   classes (whitespace, Set O) count as special only when the matching
   flag is non-zero.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.
   Spelled out as ASCII ranges rather than isalnum(): c may be any
   integer (e.g. a Py_UNICODE above 255), for which isalnum() is
   undefined, and isalnum() would also accept locale-specific letters
   that UB64() cannot map. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true.  Assumes ASCII character codes. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six pending bits remain in ch,
   emit the top six as one base64 digit through out and drop them from
   the bit count.  Fewer than six bits are left pending afterwards. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen pending bits
   remain in ch, extract the top sixteen as one code unit.  A unit in
   0xDC00..0xDFFF is reported as an error: the macro sets the enclosing
   function's `errmsg` and jumps to its `utf7Error` label, so it can only
   be used where both exist.  The unit following such an error is
   silently dropped. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 const char *errors)
950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t startinpos;
953 Py_ssize_t endinpos;
954 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 const char *e;
956 PyUnicodeObject *unicode;
957 Py_UNICODE *p;
958 const char *errmsg = "";
959 int inShift = 0;
960 unsigned int bitsleft = 0;
961 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 int surrogate = 0;
963 PyObject *errorHandler = NULL;
964 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965
966 unicode = _PyUnicode_New(size);
967 if (!unicode)
968 return NULL;
969 if (size == 0)
970 return (PyObject *)unicode;
971
972 p = unicode->str;
973 e = s + size;
974
975 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 Py_UNICODE ch;
977 restart:
978 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979
980 if (inShift) {
981 if ((ch == '-') || !B64CHAR(ch)) {
982 inShift = 0;
983 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000985 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
986 if (bitsleft >= 6) {
987 /* The shift sequence has a partial character in it. If
988 bitsleft < 6 then we could just classify it as padding
989 but that is not the case here */
990
991 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000992 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 }
994 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000995 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 here so indicate the potential of a misencoded character. */
997
998 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
999 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1000 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (ch == '-') {
1005 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001006 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 inShift = 1;
1008 }
1009 } else if (SPECIAL(ch,0,0)) {
1010 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001011 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 } else {
1013 *p++ = ch;
1014 }
1015 } else {
1016 charsleft = (charsleft << 6) | UB64(ch);
1017 bitsleft += 6;
1018 s++;
1019 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1020 }
1021 }
1022 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001023 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001024 s++;
1025 if (s < e && *s == '-') {
1026 s++;
1027 *p++ = '+';
1028 } else
1029 {
1030 inShift = 1;
1031 bitsleft = 0;
1032 }
1033 }
1034 else if (SPECIAL(ch,0,0)) {
1035 errmsg = "unexpected special character";
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 }
1039 else {
1040 *p++ = ch;
1041 s++;
1042 }
1043 continue;
1044 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001045 outpos = p-PyUnicode_AS_UNICODE(unicode);
1046 endinpos = s-starts;
1047 if (unicode_decode_call_errorhandler(
1048 errors, &errorHandler,
1049 "utf7", errmsg,
1050 starts, size, &startinpos, &endinpos, &exc, &s,
1051 (PyObject **)&unicode, &outpos, &p))
1052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 }
1054
1055 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001056 outpos = p-PyUnicode_AS_UNICODE(unicode);
1057 endinpos = size;
1058 if (unicode_decode_call_errorhandler(
1059 errors, &errorHandler,
1060 "utf7", "unterminated shift sequence",
1061 starts, size, &startinpos, &endinpos, &exc, &s,
1062 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 if (s < e)
1065 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 }
1067
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001068 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 goto onError;
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 return (PyObject *)unicode;
1074
1075onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 Py_DECREF(unicode);
1079 return NULL;
1080}
1081
1082
1083PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 int encodeSetO,
1086 int encodeWhiteSpace,
1087 const char *errors)
1088{
1089 PyObject *v;
1090 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 unsigned int bitsleft = 0;
1095 unsigned long charsleft = 0;
1096 char * out;
1097 char * start;
1098
1099 if (size == 0)
1100 return PyString_FromStringAndSize(NULL, 0);
1101
1102 v = PyString_FromStringAndSize(NULL, cbAllocated);
1103 if (v == NULL)
1104 return NULL;
1105
1106 start = out = PyString_AS_STRING(v);
1107 for (;i < size; ++i) {
1108 Py_UNICODE ch = s[i];
1109
1110 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001111 if (ch == '+') {
1112 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001113 *out++ = '-';
1114 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1115 charsleft = ch;
1116 bitsleft = 16;
1117 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001120 } else {
1121 *out++ = (char) ch;
1122 }
1123 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001124 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1125 *out++ = B64(charsleft << (6-bitsleft));
1126 charsleft = 0;
1127 bitsleft = 0;
1128 /* Characters not in the BASE64 set implicitly unshift the sequence
1129 so no '-' is required, except if the character is itself a '-' */
1130 if (B64CHAR(ch) || ch == '-') {
1131 *out++ = '-';
1132 }
1133 inShift = 0;
1134 *out++ = (char) ch;
1135 } else {
1136 bitsleft += 16;
1137 charsleft = (charsleft << 16) | ch;
1138 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1139
1140 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001141 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001142 or '-' then the shift sequence will be terminated implicitly and we
1143 don't have to insert a '-'. */
1144
1145 if (bitsleft == 0) {
1146 if (i + 1 < size) {
1147 Py_UNICODE ch2 = s[i+1];
1148
1149 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001151 } else if (B64CHAR(ch2) || ch2 == '-') {
1152 *out++ = '-';
1153 inShift = 0;
1154 } else {
1155 inShift = 0;
1156 }
1157
1158 }
1159 else {
1160 *out++ = '-';
1161 inShift = 0;
1162 }
1163 }
Tim Petersced69f82003-09-16 20:30:58 +00001164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001167 if (bitsleft) {
1168 *out++= B64(charsleft << (6-bitsleft) );
1169 *out++ = '-';
1170 }
1171
Tim Peters5de98422002-04-27 18:44:32 +00001172 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 return v;
1174}
1175
1176#undef SPECIAL
1177#undef B64
1178#undef B64CHAR
1179#undef UB64
1180#undef ENCODE
1181#undef DECODE
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* --- UTF-8 Codec -------------------------------------------------------- */
1184
/* Map a UTF-8 lead (prefix) byte to the length of the sequence it
   starts.  Zero means the byte is an illegal prefix.  See RFC 2279 for
   details.

   One row per 16 byte values; the comment gives the row's first index. */
static char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1206
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 const char *errors)
1210{
Walter Dörwald69652032004-09-07 20:24:22 +00001211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1212}
1213
1214PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001216 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t startinpos;
1222 Py_ssize_t endinpos;
1223 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *e;
1225 PyUnicodeObject *unicode;
1226 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 PyObject *errorHandler = NULL;
1229 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Note: size will always be longer than the resulting Unicode
1232 character count */
1233 unicode = _PyUnicode_New(size);
1234 if (!unicode)
1235 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 if (size == 0) {
1237 if (consumed)
1238 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Unpack UTF-8 encoded data */
1243 p = unicode->str;
1244 e = s + size;
1245
1246 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 s++;
1252 continue;
1253 }
1254
1255 n = utf8_code_length[ch];
1256
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001258 if (consumed)
1259 break;
1260 else {
1261 errmsg = "unexpected end of data";
1262 startinpos = s-starts;
1263 endinpos = size;
1264 goto utf8Error;
1265 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267
1268 switch (n) {
1269
1270 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 if ((s[1] & 0xc0) != 0x80) {
1284 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 startinpos = s-starts;
1292 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 errmsg = "illegal encoding";
1294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 break;
1299
1300 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001301 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 (s[2] & 0xc0) != 0x80) {
1303 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001304 startinpos = s-starts;
1305 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 goto utf8Error;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309 if (ch < 0x0800) {
1310 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001311 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001312
1313 XXX For wide builds (UCS-4) we should probably try
1314 to recombine the surrogates into a single code
1315 unit.
1316 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 break;
1325
1326 case 4:
1327 if ((s[1] & 0xc0) != 0x80 ||
1328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 (s[3] & 0xc0) != 0x80) {
1330 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
1334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001340 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001341 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 *p++ = (Py_UNICODE)ch;
1350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* translate from 10000..10FFFF to 0..FFFF */
1354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* high surrogate = top 10 bits added to D800 */
1357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 break;
1363
1364 default:
1365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 startinpos = s-starts;
1368 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 }
1371 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001373
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 outpos = p-PyUnicode_AS_UNICODE(unicode);
1376 if (unicode_decode_call_errorhandler(
1377 errors, &errorHandler,
1378 "utf8", errmsg,
1379 starts, size, &startinpos, &endinpos, &exc, &s,
1380 (PyObject **)&unicode, &outpos, &p))
1381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
Walter Dörwald69652032004-09-07 20:24:22 +00001383 if (consumed)
1384 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001387 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 goto onError;
1389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 return (PyObject *)unicode;
1393
1394onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 Py_XDECREF(errorHandler);
1396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 Py_DECREF(unicode);
1398 return NULL;
1399}
1400
Tim Peters602f7402002-04-27 18:03:26 +00001401/* Allocation strategy: if the string is short, convert into a stack buffer
1402 and allocate exactly as much space needed at the end. Else allocate the
1403 maximum possible needed (4 result bytes per Unicode character), and return
1404 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001405*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001406PyObject *
1407PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
Tim Peters602f7402002-04-27 18:03:26 +00001411#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001414 PyObject *v; /* result string object */
1415 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001417 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001418 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 assert(s != NULL);
1421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
Tim Peters602f7402002-04-27 18:03:26 +00001423 if (size <= MAX_SHORT_UNICHARS) {
1424 /* Write into the stack buffer; nallocated can't overflow.
1425 * At the end, we'll allocate exactly as much heap space as it
1426 * turns out we need.
1427 */
1428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1429 v = NULL; /* will allocate after we're done */
1430 p = stackbuf;
1431 }
1432 else {
1433 /* Overallocate on the heap, and give the excess back at the end. */
1434 nallocated = size * 4;
1435 if (nallocated / 4 != size) /* overflow! */
1436 return PyErr_NoMemory();
1437 v = PyString_FromStringAndSize(NULL, nallocated);
1438 if (v == NULL)
1439 return NULL;
1440 p = PyString_AS_STRING(v);
1441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442
Tim Peters602f7402002-04-27 18:03:26 +00001443 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001444 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001452 *p++ = (char)(0xc0 | (ch >> 6));
1453 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001454 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001455 else {
Tim Peters602f7402002-04-27 18:03:26 +00001456 /* Encode UCS2 Unicode ordinals */
1457 if (ch < 0x10000) {
1458 /* Special case: check for high surrogate */
1459 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1460 Py_UCS4 ch2 = s[i];
1461 /* Check for low surrogate and combine the two to
1462 form a UCS4 value */
1463 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001465 i++;
1466 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 }
Tim Peters602f7402002-04-27 18:03:26 +00001468 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001470 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001471 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1472 *p++ = (char)(0x80 | (ch & 0x3f));
1473 continue;
1474 }
1475encodeUCS4:
1476 /* Encode UCS4 Unicode ordinals */
1477 *p++ = (char)(0xf0 | (ch >> 18));
1478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001483
Tim Peters602f7402002-04-27 18:03:26 +00001484 if (v == NULL) {
1485 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001487 assert(nneeded <= nallocated);
1488 v = PyString_FromStringAndSize(stackbuf, nneeded);
1489 }
1490 else {
1491 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001492 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 assert(nneeded <= nallocated);
1494 _PyString_Resize(&v, nneeded);
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499}
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (!PyUnicode_Check(unicode)) {
1504 PyErr_BadArgument();
1505 return NULL;
1506 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001507 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510}
1511
1512/* --- UTF-16 Codec ------------------------------------------------------- */
1513
Tim Peters772747b2001-08-09 22:21:55 +00001514PyObject *
1515PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001517 const char *errors,
1518 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519{
Walter Dörwald69652032004-09-07 20:24:22 +00001520 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1521}
1522
1523PyObject *
1524PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001526 const char *errors,
1527 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 PyUnicodeObject *unicode;
1535 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001539 /* Offsets from q for retrieving byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 PyObject *errorHandler = NULL;
1546 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
1548 /* Note: size will always be longer than the resulting Unicode
1549 character count */
1550 unicode = _PyUnicode_New(size);
1551 if (!unicode)
1552 return NULL;
1553 if (size == 0)
1554 return (PyObject *)unicode;
1555
1556 /* Unpack UTF-16 encoded data */
1557 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001558 q = (unsigned char *)s;
1559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001564 /* Check for BOM marks (U+FEFF) in the input and adjust current
1565 byte order setting accordingly. In native mode, the leading BOM
1566 mark is skipped, in all other modes, it is copied to the output
1567 stream as-is (giving a ZWNBSP character). */
1568 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001569 if (size >= 2) {
1570 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001572 if (bom == 0xFEFF) {
1573 q += 2;
1574 bo = -1;
1575 }
1576 else if (bom == 0xFFFE) {
1577 q += 2;
1578 bo = 1;
1579 }
Tim Petersced69f82003-09-16 20:30:58 +00001580#else
Walter Dörwald69652032004-09-07 20:24:22 +00001581 if (bom == 0xFEFF) {
1582 q += 2;
1583 bo = 1;
1584 }
1585 else if (bom == 0xFFFE) {
1586 q += 2;
1587 bo = -1;
1588 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001589#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001590 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592
Tim Peters772747b2001-08-09 22:21:55 +00001593 if (bo == -1) {
1594 /* force LE */
1595 ihi = 1;
1596 ilo = 0;
1597 }
1598 else if (bo == 1) {
1599 /* force BE */
1600 ihi = 0;
1601 ilo = 1;
1602 }
1603
1604 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001606 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001608 if (consumed)
1609 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 errmsg = "truncated data";
1611 startinpos = ((const char *)q)-starts;
1612 endinpos = ((const char *)e)-starts;
1613 goto utf16Error;
1614 /* The remaining input chars are ignored if the callback
1615 chooses to skip the input */
1616 }
1617 ch = (q[ihi] << 8) | q[ilo];
1618
Tim Peters772747b2001-08-09 22:21:55 +00001619 q += 2;
1620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (ch < 0xD800 || ch > 0xDFFF) {
1622 *p++ = ch;
1623 continue;
1624 }
1625
1626 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 if (q >= e) {
1628 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = (((const char *)q)-2)-starts;
1630 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf16Error;
1632 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001633 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001634 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1635 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001636 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001637#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 *p++ = ch;
1639 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#else
1641 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001643 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 }
1645 else {
1646 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 startinpos = (((const char *)q)-4)-starts;
1648 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 goto utf16Error;
1650 }
1651
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-2)-starts;
1655 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 /* Fall through to report the error */
1657
1658 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 outpos = p-PyUnicode_AS_UNICODE(unicode);
1660 if (unicode_decode_call_errorhandler(
1661 errors, &errorHandler,
1662 "utf16", errmsg,
1663 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1664 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 }
1667
1668 if (byteorder)
1669 *byteorder = bo;
1670
Walter Dörwald69652032004-09-07 20:24:22 +00001671 if (consumed)
1672 *consumed = (const char *)q-starts;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001675 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 goto onError;
1677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 Py_XDECREF(errorHandler);
1679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 return (PyObject *)unicode;
1681
1682onError:
1683 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 Py_XDECREF(errorHandler);
1685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return NULL;
1687}
1688
Tim Peters772747b2001-08-09 22:21:55 +00001689PyObject *
1690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001692 const char *errors,
1693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694{
1695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001696 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001697#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001698 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001699#else
1700 const int pairs = 0;
1701#endif
Tim Peters772747b2001-08-09 22:21:55 +00001702 /* Offsets from p for storing byte pairs in the right order. */
1703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1704 int ihi = 1, ilo = 0;
1705#else
1706 int ihi = 0, ilo = 1;
1707#endif
1708
1709#define STORECHAR(CH) \
1710 do { \
1711 p[ihi] = ((CH) >> 8) & 0xff; \
1712 p[ilo] = (CH) & 0xff; \
1713 p += 2; \
1714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001717 for (i = pairs = 0; i < size; i++)
1718 if (s[i] >= 0x10000)
1719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001720#endif
Tim Petersced69f82003-09-16 20:30:58 +00001721 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001722 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 if (v == NULL)
1724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Tim Peters772747b2001-08-09 22:21:55 +00001726 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001728 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001729 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001731
1732 if (byteorder == -1) {
1733 /* force LE */
1734 ihi = 1;
1735 ilo = 0;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 ihi = 0;
1740 ilo = 1;
1741 }
1742
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 while (size-- > 0) {
1744 Py_UNICODE ch = *s++;
1745 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001747 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001748 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1749 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#endif
Tim Peters772747b2001-08-09 22:21:55 +00001752 STORECHAR(ch);
1753 if (ch2)
1754 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758}
1759
1760PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1761{
1762 if (!PyUnicode_Check(unicode)) {
1763 PyErr_BadArgument();
1764 return NULL;
1765 }
1766 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1767 PyUnicode_GET_SIZE(unicode),
1768 NULL,
1769 0);
1770}
1771
1772/* --- Unicode Escape Codec ----------------------------------------------- */
1773
Fredrik Lundh06d12682001-01-24 07:59:11 +00001774static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001775
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 const char *errors)
1779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001788 char* message;
1789 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 PyObject *errorHandler = NULL;
1791 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 /* Escaped strings will always be longer than the resulting
1794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 length after conversion to the true value.
1796 (but if the error callback returns a long replacement string
1797 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 v = _PyUnicode_New(size);
1799 if (v == NULL)
1800 goto onError;
1801 if (size == 0)
1802 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 while (s < end) {
1808 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001809 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 /* Non-escape characters are interpreted as Unicode ordinals */
1813 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 continue;
1816 }
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* \ - Escapes */
1820 s++;
1821 switch (*s++) {
1822
1823 /* \x escapes */
1824 case '\n': break;
1825 case '\\': *p++ = '\\'; break;
1826 case '\'': *p++ = '\''; break;
1827 case '\"': *p++ = '\"'; break;
1828 case 'b': *p++ = '\b'; break;
1829 case 'f': *p++ = '\014'; break; /* FF */
1830 case 't': *p++ = '\t'; break;
1831 case 'n': *p++ = '\n'; break;
1832 case 'r': *p++ = '\r'; break;
1833 case 'v': *p++ = '\013'; break; /* VT */
1834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1835
1836 /* \OOO (octal) escapes */
1837 case '0': case '1': case '2': case '3':
1838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848 /* hex escapes */
1849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851 digits = 2;
1852 message = "truncated \\xXX escape";
1853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 digits = 4;
1858 message = "truncated \\uXXXX escape";
1859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 digits = 8;
1864 message = "truncated \\UXXXXXXXX escape";
1865 hexescape:
1866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 outpos = p-PyUnicode_AS_UNICODE(v);
1868 if (s+digits>end) {
1869 endinpos = size;
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "end of string in escape sequence",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
1875 goto onError;
1876 goto nextByte;
1877 }
1878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 endinpos = (s+i+1)-starts;
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 }
1890 chr = (chr<<4) & ~0xF;
1891 if (c >= '0' && c <= '9')
1892 chr += c - '0';
1893 else if (c >= 'a' && c <= 'f')
1894 chr += 10 + c - 'a';
1895 else
1896 chr += 10 + c - 'A';
1897 }
1898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 /* _decoding_error will have already written into the
1901 target buffer. */
1902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001904 /* when we get here, chr is a 32-bit unicode character */
1905 if (chr <= 0xffff)
1906 /* UCS-2 character */
1907 *p++ = (Py_UNICODE) chr;
1908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = chr;
1913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 chr -= 0x10000L;
1915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", "illegal Unicode character",
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 goto onError;
1927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001928 break;
1929
1930 /* \N{name} */
1931 case 'N':
1932 message = "malformed \\N character escape";
1933 if (ucnhash_CAPI == NULL) {
1934 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001935 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 m = PyImport_ImportModule("unicodedata");
1937 if (m == NULL)
1938 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001943 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001944 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001945 if (ucnhash_CAPI == NULL)
1946 goto ucnhashError;
1947 }
1948 if (*s == '{') {
1949 const char *start = s+1;
1950 /* look for the closing brace */
1951 while (*s != '}' && s < end)
1952 s++;
1953 if (s > start && s < end && *s == '}') {
1954 /* found a name. look it up in the unicode database */
1955 message = "unknown Unicode character name";
1956 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001957 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 goto store;
1959 }
1960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 endinpos = s-starts;
1962 outpos = p-PyUnicode_AS_UNICODE(v);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "unicodeescape", message,
1966 starts, size, &startinpos, &endinpos, &exc, &s,
1967 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001968 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001969 break;
1970
1971 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001972 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 message = "\\ at end of string";
1974 s--;
1975 endinpos = s-starts;
1976 outpos = p-PyUnicode_AS_UNICODE(v);
1977 if (unicode_decode_call_errorhandler(
1978 errors, &errorHandler,
1979 "unicodeescape", message,
1980 starts, size, &startinpos, &endinpos, &exc, &s,
1981 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001982 goto onError;
1983 }
1984 else {
1985 *p++ = '\\';
1986 *p++ = (unsigned char)s[-1];
1987 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 nextByte:
1991 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001995 Py_XDECREF(errorHandler);
1996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001998
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002000 PyErr_SetString(
2001 PyExc_UnicodeError,
2002 "\\N escapes not supported (can't load unicodedata module)"
2003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005 Py_XDECREF(errorHandler);
2006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002007 return NULL;
2008
Fredrik Lundhccc74732001-02-18 22:13:49 +00002009onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_XDECREF(errorHandler);
2012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 return NULL;
2014}
2015
2016/* Return a Unicode-Escape string version of the Unicode object.
2017
2018 If quotes is true, the string is enclosed in u"" or u'' quotes as
2019 appropriate.
2020
2021*/
2022
Fredrik Lundh347ee272006-05-24 16:35:18 +00002023LOCAL(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2024 Py_ssize_t size,
2025 Py_UNICODE ch)
2026{
2027 /* like wcschr, but doesn't stop at NULL characters */
2028
2029 while (size-- > 0) {
2030 if (*s == ch)
2031 return s;
2032 s++;
2033 }
2034
2035 return NULL;
2036}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002037
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038static
2039PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002040 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 int quotes)
2042{
2043 PyObject *repr;
2044 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002046 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
2048 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2049 if (repr == NULL)
2050 return NULL;
2051
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002052 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
2054 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002056 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 !findchar(s, size, '"')) ? '"' : '\'';
2058 }
2059 while (size-- > 0) {
2060 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002061
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002062 /* Escape quotes and backslashes */
2063 if ((quotes &&
2064 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 *p++ = '\\';
2066 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002067 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002068 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002069
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002070#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002071 /* Map 21-bit characters to '\U00xxxxxx' */
2072 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002073 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002074
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002075 /* Resize the string if necessary */
2076 if (offset + 12 > PyString_GET_SIZE(repr)) {
2077 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002078 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002079 p = PyString_AS_STRING(repr) + offset;
2080 }
2081
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002082 *p++ = '\\';
2083 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002084 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2086 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2087 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2088 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2089 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2090 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002091 *p++ = hexdigit[ch & 0x0000000F];
2092 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002093 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002094#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2096 else if (ch >= 0xD800 && ch < 0xDC00) {
2097 Py_UNICODE ch2;
2098 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002099
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002100 ch2 = *s++;
2101 size--;
2102 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2103 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2104 *p++ = '\\';
2105 *p++ = 'U';
2106 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2108 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2109 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2110 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2111 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2112 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2113 *p++ = hexdigit[ucs & 0x0000000F];
2114 continue;
2115 }
2116 /* Fall through: isolated surrogates are copied as-is */
2117 s--;
2118 size++;
2119 }
2120
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002122 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 *p++ = '\\';
2124 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002125 *p++ = hexdigit[(ch >> 12) & 0x000F];
2126 *p++ = hexdigit[(ch >> 8) & 0x000F];
2127 *p++ = hexdigit[(ch >> 4) & 0x000F];
2128 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002130
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002131 /* Map special whitespace to '\t', \n', '\r' */
2132 else if (ch == '\t') {
2133 *p++ = '\\';
2134 *p++ = 't';
2135 }
2136 else if (ch == '\n') {
2137 *p++ = '\\';
2138 *p++ = 'n';
2139 }
2140 else if (ch == '\r') {
2141 *p++ = '\\';
2142 *p++ = 'r';
2143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002144
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002145 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002146 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002148 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002149 *p++ = hexdigit[(ch >> 4) & 0x000F];
2150 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002152
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 /* Copy everything else as-is */
2154 else
2155 *p++ = (char) ch;
2156 }
2157 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159
2160 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002161 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 return repr;
2163}
2164
2165PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002166 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
2168 return unicodeescape_string(s, size, 0);
2169}
2170
2171PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2172{
2173 if (!PyUnicode_Check(unicode)) {
2174 PyErr_BadArgument();
2175 return NULL;
2176 }
2177 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2178 PyUnicode_GET_SIZE(unicode));
2179}
2180
2181/* --- Raw Unicode Escape Codec ------------------------------------------- */
2182
2183PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 const char *errors)
2186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002188 Py_ssize_t startinpos;
2189 Py_ssize_t endinpos;
2190 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002192 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 const char *end;
2194 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 PyObject *errorHandler = NULL;
2196 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002197
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 /* Escaped strings will always be longer than the resulting
2199 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 length after conversion to the true value. (But decoding error
2201 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 v = _PyUnicode_New(size);
2203 if (v == NULL)
2204 goto onError;
2205 if (size == 0)
2206 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 end = s + size;
2209 while (s < end) {
2210 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002211 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002213 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214
2215 /* Non-escape characters are interpreted as Unicode ordinals */
2216 if (*s != '\\') {
2217 *p++ = (unsigned char)*s++;
2218 continue;
2219 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002220 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221
2222 /* \u-escapes are only interpreted iff the number of leading
2223 backslashes if odd */
2224 bs = s;
2225 for (;s < end;) {
2226 if (*s != '\\')
2227 break;
2228 *p++ = (unsigned char)*s++;
2229 }
2230 if (((s - bs) & 1) == 0 ||
2231 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002232 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 continue;
2234 }
2235 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 s++;
2238
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002239 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002241 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 endinpos = s-starts;
2245 if (unicode_decode_call_errorhandler(
2246 errors, &errorHandler,
2247 "rawunicodeescape", "truncated \\uXXXX",
2248 starts, size, &startinpos, &endinpos, &exc, &s,
2249 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 }
2253 x = (x<<4) & ~0xF;
2254 if (c >= '0' && c <= '9')
2255 x += c - '0';
2256 else if (c >= 'a' && c <= 'f')
2257 x += 10 + c - 'a';
2258 else
2259 x += 10 + c - 'A';
2260 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002261#ifndef Py_UNICODE_WIDE
2262 if (x > 0x10000) {
2263 if (unicode_decode_call_errorhandler(
2264 errors, &errorHandler,
2265 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2266 starts, size, &startinpos, &endinpos, &exc, &s,
2267 (PyObject **)&v, &outpos, &p))
2268 goto onError;
2269 }
2270#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 *p++ = x;
2272 nextByte:
2273 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002275 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 Py_XDECREF(errorHandler);
2278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002280
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 onError:
2282 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 Py_XDECREF(errorHandler);
2284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 return NULL;
2286}
2287
2288PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290{
2291 PyObject *repr;
2292 char *p;
2293 char *q;
2294
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002295 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002297#ifdef Py_UNICODE_WIDE
2298 repr = PyString_FromStringAndSize(NULL, 10 * size);
2299#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002301#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 if (repr == NULL)
2303 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002304 if (size == 0)
2305 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
2307 p = q = PyString_AS_STRING(repr);
2308 while (size-- > 0) {
2309 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002310#ifdef Py_UNICODE_WIDE
2311 /* Map 32-bit characters to '\Uxxxxxxxx' */
2312 if (ch >= 0x10000) {
2313 *p++ = '\\';
2314 *p++ = 'U';
2315 *p++ = hexdigit[(ch >> 28) & 0xf];
2316 *p++ = hexdigit[(ch >> 24) & 0xf];
2317 *p++ = hexdigit[(ch >> 20) & 0xf];
2318 *p++ = hexdigit[(ch >> 16) & 0xf];
2319 *p++ = hexdigit[(ch >> 12) & 0xf];
2320 *p++ = hexdigit[(ch >> 8) & 0xf];
2321 *p++ = hexdigit[(ch >> 4) & 0xf];
2322 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002323 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002324 else
2325#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 /* Map 16-bit characters to '\uxxxx' */
2327 if (ch >= 256) {
2328 *p++ = '\\';
2329 *p++ = 'u';
2330 *p++ = hexdigit[(ch >> 12) & 0xf];
2331 *p++ = hexdigit[(ch >> 8) & 0xf];
2332 *p++ = hexdigit[(ch >> 4) & 0xf];
2333 *p++ = hexdigit[ch & 15];
2334 }
2335 /* Copy everything else as-is */
2336 else
2337 *p++ = (char) ch;
2338 }
2339 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002340 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 return repr;
2342}
2343
2344PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2345{
2346 if (!PyUnicode_Check(unicode)) {
2347 PyErr_BadArgument();
2348 return NULL;
2349 }
2350 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2351 PyUnicode_GET_SIZE(unicode));
2352}
2353
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002354/* --- Unicode Internal Codec ------------------------------------------- */
2355
2356PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002357 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002358 const char *errors)
2359{
2360 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002361 Py_ssize_t startinpos;
2362 Py_ssize_t endinpos;
2363 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002364 PyUnicodeObject *v;
2365 Py_UNICODE *p;
2366 const char *end;
2367 const char *reason;
2368 PyObject *errorHandler = NULL;
2369 PyObject *exc = NULL;
2370
Neal Norwitzd43069c2006-01-08 01:12:10 +00002371#ifdef Py_UNICODE_WIDE
2372 Py_UNICODE unimax = PyUnicode_GetMax();
2373#endif
2374
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002375 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2376 if (v == NULL)
2377 goto onError;
2378 if (PyUnicode_GetSize((PyObject *)v) == 0)
2379 return (PyObject *)v;
2380 p = PyUnicode_AS_UNICODE(v);
2381 end = s + size;
2382
2383 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002384 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002385 /* We have to sanity check the raw data, otherwise doom looms for
2386 some malformed UCS-4 data. */
2387 if (
2388 #ifdef Py_UNICODE_WIDE
2389 *p > unimax || *p < 0 ||
2390 #endif
2391 end-s < Py_UNICODE_SIZE
2392 )
2393 {
2394 startinpos = s - starts;
2395 if (end-s < Py_UNICODE_SIZE) {
2396 endinpos = end-starts;
2397 reason = "truncated input";
2398 }
2399 else {
2400 endinpos = s - starts + Py_UNICODE_SIZE;
2401 reason = "illegal code point (> 0x10FFFF)";
2402 }
2403 outpos = p - PyUnicode_AS_UNICODE(v);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "unicode_internal", reason,
2407 starts, size, &startinpos, &endinpos, &exc, &s,
2408 (PyObject **)&v, &outpos, &p)) {
2409 goto onError;
2410 }
2411 }
2412 else {
2413 p++;
2414 s += Py_UNICODE_SIZE;
2415 }
2416 }
2417
Martin v. Löwis412fb672006-04-13 06:34:32 +00002418 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002419 goto onError;
2420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
2422 return (PyObject *)v;
2423
2424 onError:
2425 Py_XDECREF(v);
2426 Py_XDECREF(errorHandler);
2427 Py_XDECREF(exc);
2428 return NULL;
2429}
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431/* --- Latin-1 Codec ------------------------------------------------------ */
2432
2433PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002434 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 const char *errors)
2436{
2437 PyUnicodeObject *v;
2438 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002439
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002441 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002442 Py_UNICODE r = *(unsigned char*)s;
2443 return PyUnicode_FromUnicode(&r, 1);
2444 }
2445
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446 v = _PyUnicode_New(size);
2447 if (v == NULL)
2448 goto onError;
2449 if (size == 0)
2450 return (PyObject *)v;
2451 p = PyUnicode_AS_UNICODE(v);
2452 while (size-- > 0)
2453 *p++ = (unsigned char)*s++;
2454 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 onError:
2457 Py_XDECREF(v);
2458 return NULL;
2459}
2460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461/* create or adjust a UnicodeEncodeError */
2462static void make_encode_exception(PyObject **exceptionObject,
2463 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002464 const Py_UNICODE *unicode, Py_ssize_t size,
2465 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (*exceptionObject == NULL) {
2469 *exceptionObject = PyUnicodeEncodeError_Create(
2470 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 }
2472 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2474 goto onError;
2475 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2476 goto onError;
2477 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2478 goto onError;
2479 return;
2480 onError:
2481 Py_DECREF(*exceptionObject);
2482 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 }
2484}
2485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486/* raises a UnicodeEncodeError */
2487static void raise_encode_exception(PyObject **exceptionObject,
2488 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002489 const Py_UNICODE *unicode, Py_ssize_t size,
2490 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 const char *reason)
2492{
2493 make_encode_exception(exceptionObject,
2494 encoding, unicode, size, startpos, endpos, reason);
2495 if (*exceptionObject != NULL)
2496 PyCodec_StrictErrors(*exceptionObject);
2497}
2498
2499/* error handling callback helper:
2500 build arguments, call the callback and check the arguments,
2501 put the result into newpos and return the replacement string, which
2502 has to be freed by the caller */
2503static PyObject *unicode_encode_call_errorhandler(const char *errors,
2504 PyObject **errorHandler,
2505 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002506 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2507 Py_ssize_t startpos, Py_ssize_t endpos,
2508 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002509{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002510 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511
2512 PyObject *restuple;
2513 PyObject *resunicode;
2514
2515 if (*errorHandler == NULL) {
2516 *errorHandler = PyCodec_LookupError(errors);
2517 if (*errorHandler == NULL)
2518 return NULL;
2519 }
2520
2521 make_encode_exception(exceptionObject,
2522 encoding, unicode, size, startpos, endpos, reason);
2523 if (*exceptionObject == NULL)
2524 return NULL;
2525
2526 restuple = PyObject_CallFunctionObjArgs(
2527 *errorHandler, *exceptionObject, NULL);
2528 if (restuple == NULL)
2529 return NULL;
2530 if (!PyTuple_Check(restuple)) {
2531 PyErr_Format(PyExc_TypeError, &argparse[4]);
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2536 &resunicode, newpos)) {
2537 Py_DECREF(restuple);
2538 return NULL;
2539 }
2540 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002541 *newpos = size+*newpos;
2542 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002543 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002544 Py_DECREF(restuple);
2545 return NULL;
2546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 Py_INCREF(resunicode);
2548 Py_DECREF(restuple);
2549 return resunicode;
2550}
2551
2552static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002553 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 const char *errors,
2555 int limit)
2556{
2557 /* output object */
2558 PyObject *res;
2559 /* pointers to the beginning and end+1 of input */
2560 const Py_UNICODE *startp = p;
2561 const Py_UNICODE *endp = p + size;
2562 /* pointer to the beginning of the unencodable characters */
2563 /* const Py_UNICODE *badp = NULL; */
2564 /* pointer into the output */
2565 char *str;
2566 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002567 Py_ssize_t respos = 0;
2568 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002569 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2570 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002571 PyObject *errorHandler = NULL;
2572 PyObject *exc = NULL;
2573 /* the following variable is used for caching string comparisons
2574 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2575 int known_errorHandler = -1;
2576
2577 /* allocate enough for a simple encoding without
2578 replacements, if we need more, we'll resize */
2579 res = PyString_FromStringAndSize(NULL, size);
2580 if (res == NULL)
2581 goto onError;
2582 if (size == 0)
2583 return res;
2584 str = PyString_AS_STRING(res);
2585 ressize = size;
2586
2587 while (p<endp) {
2588 Py_UNICODE c = *p;
2589
2590 /* can we encode this? */
2591 if (c<limit) {
2592 /* no overflow check, because we know that the space is enough */
2593 *str++ = (char)c;
2594 ++p;
2595 }
2596 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002597 Py_ssize_t unicodepos = p-startp;
2598 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002600 Py_ssize_t repsize;
2601 Py_ssize_t newpos;
2602 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 Py_UNICODE *uni2;
2604 /* startpos for collecting unencodable chars */
2605 const Py_UNICODE *collstart = p;
2606 const Py_UNICODE *collend = p;
2607 /* find all unecodable characters */
2608 while ((collend < endp) && ((*collend)>=limit))
2609 ++collend;
2610 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2611 if (known_errorHandler==-1) {
2612 if ((errors==NULL) || (!strcmp(errors, "strict")))
2613 known_errorHandler = 1;
2614 else if (!strcmp(errors, "replace"))
2615 known_errorHandler = 2;
2616 else if (!strcmp(errors, "ignore"))
2617 known_errorHandler = 3;
2618 else if (!strcmp(errors, "xmlcharrefreplace"))
2619 known_errorHandler = 4;
2620 else
2621 known_errorHandler = 0;
2622 }
2623 switch (known_errorHandler) {
2624 case 1: /* strict */
2625 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2626 goto onError;
2627 case 2: /* replace */
2628 while (collstart++<collend)
2629 *str++ = '?'; /* fall through */
2630 case 3: /* ignore */
2631 p = collend;
2632 break;
2633 case 4: /* xmlcharrefreplace */
2634 respos = str-PyString_AS_STRING(res);
2635 /* determine replacement size (temporarily (mis)uses p) */
2636 for (p = collstart, repsize = 0; p < collend; ++p) {
2637 if (*p<10)
2638 repsize += 2+1+1;
2639 else if (*p<100)
2640 repsize += 2+2+1;
2641 else if (*p<1000)
2642 repsize += 2+3+1;
2643 else if (*p<10000)
2644 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002645#ifndef Py_UNICODE_WIDE
2646 else
2647 repsize += 2+5+1;
2648#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 else if (*p<100000)
2650 repsize += 2+5+1;
2651 else if (*p<1000000)
2652 repsize += 2+6+1;
2653 else
2654 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002655#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 }
2657 requiredsize = respos+repsize+(endp-collend);
2658 if (requiredsize > ressize) {
2659 if (requiredsize<2*ressize)
2660 requiredsize = 2*ressize;
2661 if (_PyString_Resize(&res, requiredsize))
2662 goto onError;
2663 str = PyString_AS_STRING(res) + respos;
2664 ressize = requiredsize;
2665 }
2666 /* generate replacement (temporarily (mis)uses p) */
2667 for (p = collstart; p < collend; ++p) {
2668 str += sprintf(str, "&#%d;", (int)*p);
2669 }
2670 p = collend;
2671 break;
2672 default:
2673 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2674 encoding, reason, startp, size, &exc,
2675 collstart-startp, collend-startp, &newpos);
2676 if (repunicode == NULL)
2677 goto onError;
2678 /* need more space? (at least enough for what we
2679 have+the replacement+the rest of the string, so
2680 we won't have to check space for encodable characters) */
2681 respos = str-PyString_AS_STRING(res);
2682 repsize = PyUnicode_GET_SIZE(repunicode);
2683 requiredsize = respos+repsize+(endp-collend);
2684 if (requiredsize > ressize) {
2685 if (requiredsize<2*ressize)
2686 requiredsize = 2*ressize;
2687 if (_PyString_Resize(&res, requiredsize)) {
2688 Py_DECREF(repunicode);
2689 goto onError;
2690 }
2691 str = PyString_AS_STRING(res) + respos;
2692 ressize = requiredsize;
2693 }
2694 /* check if there is anything unencodable in the replacement
2695 and copy it to the output */
2696 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2697 c = *uni2;
2698 if (c >= limit) {
2699 raise_encode_exception(&exc, encoding, startp, size,
2700 unicodepos, unicodepos+1, reason);
2701 Py_DECREF(repunicode);
2702 goto onError;
2703 }
2704 *str = (char)c;
2705 }
2706 p = startp + newpos;
2707 Py_DECREF(repunicode);
2708 }
2709 }
2710 }
2711 /* Resize if we allocated to much */
2712 respos = str-PyString_AS_STRING(res);
2713 if (respos<ressize)
2714 /* If this falls res will be NULL */
2715 _PyString_Resize(&res, respos);
2716 Py_XDECREF(errorHandler);
2717 Py_XDECREF(exc);
2718 return res;
2719
2720 onError:
2721 Py_XDECREF(res);
2722 Py_XDECREF(errorHandler);
2723 Py_XDECREF(exc);
2724 return NULL;
2725}
2726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002728 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 const char *errors)
2730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732}
2733
2734PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2735{
2736 if (!PyUnicode_Check(unicode)) {
2737 PyErr_BadArgument();
2738 return NULL;
2739 }
2740 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2741 PyUnicode_GET_SIZE(unicode),
2742 NULL);
2743}
2744
2745/* --- 7-bit ASCII Codec -------------------------------------------------- */
2746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002748 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 const char *errors)
2750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 PyUnicodeObject *v;
2753 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002754 Py_ssize_t startinpos;
2755 Py_ssize_t endinpos;
2756 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 const char *e;
2758 PyObject *errorHandler = NULL;
2759 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002760
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002762 if (size == 1 && *(unsigned char*)s < 128) {
2763 Py_UNICODE r = *(unsigned char*)s;
2764 return PyUnicode_FromUnicode(&r, 1);
2765 }
Tim Petersced69f82003-09-16 20:30:58 +00002766
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 v = _PyUnicode_New(size);
2768 if (v == NULL)
2769 goto onError;
2770 if (size == 0)
2771 return (PyObject *)v;
2772 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 e = s + size;
2774 while (s < e) {
2775 register unsigned char c = (unsigned char)*s;
2776 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 ++s;
2779 }
2780 else {
2781 startinpos = s-starts;
2782 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002783 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 if (unicode_decode_call_errorhandler(
2785 errors, &errorHandler,
2786 "ascii", "ordinal not in range(128)",
2787 starts, size, &startinpos, &endinpos, &exc, &s,
2788 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002792 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002793 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002794 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795 Py_XDECREF(errorHandler);
2796 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002798
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 onError:
2800 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 Py_XDECREF(errorHandler);
2802 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 return NULL;
2804}
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002807 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 const char *errors)
2809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811}
2812
2813PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2814{
2815 if (!PyUnicode_Check(unicode)) {
2816 PyErr_BadArgument();
2817 return NULL;
2818 }
2819 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2820 PyUnicode_GET_SIZE(unicode),
2821 NULL);
2822}
2823
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002824#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002828PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002829 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002830 const char *errors)
2831{
2832 PyUnicodeObject *v;
2833 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002834 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835
2836 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 assert(size < INT_MAX);
2838 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002839 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002840 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2841
2842 v = _PyUnicode_New(usize);
2843 if (v == NULL)
2844 return NULL;
2845 if (usize == 0)
2846 return (PyObject *)v;
2847 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002849 Py_DECREF(v);
2850 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2851 }
2852
2853 return (PyObject *)v;
2854}
2855
2856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002857 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002858 const char *errors)
2859{
2860 PyObject *repr;
2861 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002862 DWORD mbcssize;
2863
2864 /* If there are no characters, bail now! */
2865 if (size==0)
2866 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867
2868 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002869 assert(size<INT_MAX);
2870 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002871 if (mbcssize==0)
2872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2873
2874 repr = PyString_FromStringAndSize(NULL, mbcssize);
2875 if (repr == NULL)
2876 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002877 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002878 return repr;
2879
2880 /* Do the conversion */
2881 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002882 assert(size < INT_MAX);
2883 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002884 Py_DECREF(repr);
2885 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2886 }
2887 return repr;
2888}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002889
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002890PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2891{
2892 if (!PyUnicode_Check(unicode)) {
2893 PyErr_BadArgument();
2894 return NULL;
2895 }
2896 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2897 PyUnicode_GET_SIZE(unicode),
2898 NULL);
2899}
2900
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002901#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002902
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903/* --- Character Mapping Codec -------------------------------------------- */
2904
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 PyObject *mapping,
2908 const char *errors)
2909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002911 Py_ssize_t startinpos;
2912 Py_ssize_t endinpos;
2913 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 PyUnicodeObject *v;
2916 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002917 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 PyObject *errorHandler = NULL;
2919 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002920 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002921 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002922
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 /* Default to Latin-1 */
2924 if (mapping == NULL)
2925 return PyUnicode_DecodeLatin1(s, size, errors);
2926
2927 v = _PyUnicode_New(size);
2928 if (v == NULL)
2929 goto onError;
2930 if (size == 0)
2931 return (PyObject *)v;
2932 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002934 if (PyUnicode_CheckExact(mapping)) {
2935 mapstring = PyUnicode_AS_UNICODE(mapping);
2936 maplen = PyUnicode_GET_SIZE(mapping);
2937 while (s < e) {
2938 unsigned char ch = *s;
2939 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002941 if (ch < maplen)
2942 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944 if (x == 0xfffe) {
2945 /* undefined mapping */
2946 outpos = p-PyUnicode_AS_UNICODE(v);
2947 startinpos = s-starts;
2948 endinpos = startinpos+1;
2949 if (unicode_decode_call_errorhandler(
2950 errors, &errorHandler,
2951 "charmap", "character maps to <undefined>",
2952 starts, size, &startinpos, &endinpos, &exc, &s,
2953 (PyObject **)&v, &outpos, &p)) {
2954 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002957 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002958 *p++ = x;
2959 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002961 }
2962 else {
2963 while (s < e) {
2964 unsigned char ch = *s;
2965 PyObject *w, *x;
2966
2967 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2968 w = PyInt_FromLong((long)ch);
2969 if (w == NULL)
2970 goto onError;
2971 x = PyObject_GetItem(mapping, w);
2972 Py_DECREF(w);
2973 if (x == NULL) {
2974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2975 /* No mapping found means: mapping is undefined. */
2976 PyErr_Clear();
2977 x = Py_None;
2978 Py_INCREF(x);
2979 } else
2980 goto onError;
2981 }
2982
2983 /* Apply mapping */
2984 if (PyInt_Check(x)) {
2985 long value = PyInt_AS_LONG(x);
2986 if (value < 0 || value > 65535) {
2987 PyErr_SetString(PyExc_TypeError,
2988 "character mapping must be in range(65536)");
2989 Py_DECREF(x);
2990 goto onError;
2991 }
2992 *p++ = (Py_UNICODE)value;
2993 }
2994 else if (x == Py_None) {
2995 /* undefined mapping */
2996 outpos = p-PyUnicode_AS_UNICODE(v);
2997 startinpos = s-starts;
2998 endinpos = startinpos+1;
2999 if (unicode_decode_call_errorhandler(
3000 errors, &errorHandler,
3001 "charmap", "character maps to <undefined>",
3002 starts, size, &startinpos, &endinpos, &exc, &s,
3003 (PyObject **)&v, &outpos, &p)) {
3004 Py_DECREF(x);
3005 goto onError;
3006 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003007 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003008 continue;
3009 }
3010 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003011 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003012
3013 if (targetsize == 1)
3014 /* 1-1 mapping */
3015 *p++ = *PyUnicode_AS_UNICODE(x);
3016
3017 else if (targetsize > 1) {
3018 /* 1-n mapping */
3019 if (targetsize > extrachars) {
3020 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003021 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3022 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003023 (targetsize << 2);
3024 extrachars += needed;
3025 if (_PyUnicode_Resize(&v,
3026 PyUnicode_GET_SIZE(v) + needed) < 0) {
3027 Py_DECREF(x);
3028 goto onError;
3029 }
3030 p = PyUnicode_AS_UNICODE(v) + oldpos;
3031 }
3032 Py_UNICODE_COPY(p,
3033 PyUnicode_AS_UNICODE(x),
3034 targetsize);
3035 p += targetsize;
3036 extrachars -= targetsize;
3037 }
3038 /* 1-0 mapping: skip the character */
3039 }
3040 else {
3041 /* wrong return value */
3042 PyErr_SetString(PyExc_TypeError,
3043 "character mapping must return integer, None or unicode");
3044 Py_DECREF(x);
3045 goto onError;
3046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003048 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 }
3051 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003052 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003057
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 Py_XDECREF(errorHandler);
3060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 Py_XDECREF(v);
3062 return NULL;
3063}
3064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065/* Lookup the character ch in the mapping. If the character
3066 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003067 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 PyObject *w = PyInt_FromLong((long)c);
3071 PyObject *x;
3072
3073 if (w == NULL)
3074 return NULL;
3075 x = PyObject_GetItem(mapping, w);
3076 Py_DECREF(w);
3077 if (x == NULL) {
3078 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3079 /* No mapping found means: mapping is undefined. */
3080 PyErr_Clear();
3081 x = Py_None;
3082 Py_INCREF(x);
3083 return x;
3084 } else
3085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003087 else if (x == Py_None)
3088 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 else if (PyInt_Check(x)) {
3090 long value = PyInt_AS_LONG(x);
3091 if (value < 0 || value > 255) {
3092 PyErr_SetString(PyExc_TypeError,
3093 "character mapping must be in range(256)");
3094 Py_DECREF(x);
3095 return NULL;
3096 }
3097 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 else if (PyString_Check(x))
3100 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 /* wrong return value */
3103 PyErr_SetString(PyExc_TypeError,
3104 "character mapping must return integer, None or str");
3105 Py_DECREF(x);
3106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
3108}
3109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110/* lookup the character, put the result in the output string and adjust
3111 various state variables. Reallocate the output string if not enough
3112 space is available. Return a new reference to the object that
3113 was put in the output buffer, or Py_None, if the mapping was undefined
3114 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003115 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116static
3117PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119{
3120 PyObject *rep = charmapencode_lookup(c, mapping);
3121
3122 if (rep==NULL)
3123 return NULL;
3124 else if (rep==Py_None)
3125 return rep;
3126 else {
3127 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003128 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003130 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 if (outsize<requiredsize) {
3132 /* exponentially overallocate to minimize reallocations */
3133 if (requiredsize < 2*outsize)
3134 requiredsize = 2*outsize;
3135 if (_PyString_Resize(outobj, requiredsize)) {
3136 Py_DECREF(rep);
3137 return NULL;
3138 }
3139 outstart = PyString_AS_STRING(*outobj);
3140 }
3141 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3142 }
3143 else {
3144 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003145 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3146 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 if (outsize<requiredsize) {
3148 /* exponentially overallocate to minimize reallocations */
3149 if (requiredsize < 2*outsize)
3150 requiredsize = 2*outsize;
3151 if (_PyString_Resize(outobj, requiredsize)) {
3152 Py_DECREF(rep);
3153 return NULL;
3154 }
3155 outstart = PyString_AS_STRING(*outobj);
3156 }
3157 memcpy(outstart + *outpos, repchars, repsize);
3158 *outpos += repsize;
3159 }
3160 }
3161 return rep;
3162}
3163
3164/* handle an error in PyUnicode_EncodeCharmap
3165 Return 0 on success, -1 on error */
3166static
3167int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003168 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003169 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003170 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003171 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172{
3173 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003174 Py_ssize_t repsize;
3175 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 Py_UNICODE *uni2;
3177 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003178 Py_ssize_t collstartpos = *inpos;
3179 Py_ssize_t collendpos = *inpos+1;
3180 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 char *encoding = "charmap";
3182 char *reason = "character maps to <undefined>";
3183
3184 PyObject *x;
3185 /* find all unencodable characters */
3186 while (collendpos < size) {
3187 x = charmapencode_lookup(p[collendpos], mapping);
3188 if (x==NULL)
3189 return -1;
3190 else if (x!=Py_None) {
3191 Py_DECREF(x);
3192 break;
3193 }
3194 Py_DECREF(x);
3195 ++collendpos;
3196 }
3197 /* cache callback name lookup
3198 * (if not done yet, i.e. it's the first error) */
3199 if (*known_errorHandler==-1) {
3200 if ((errors==NULL) || (!strcmp(errors, "strict")))
3201 *known_errorHandler = 1;
3202 else if (!strcmp(errors, "replace"))
3203 *known_errorHandler = 2;
3204 else if (!strcmp(errors, "ignore"))
3205 *known_errorHandler = 3;
3206 else if (!strcmp(errors, "xmlcharrefreplace"))
3207 *known_errorHandler = 4;
3208 else
3209 *known_errorHandler = 0;
3210 }
3211 switch (*known_errorHandler) {
3212 case 1: /* strict */
3213 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3214 return -1;
3215 case 2: /* replace */
3216 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3217 x = charmapencode_output('?', mapping, res, respos);
3218 if (x==NULL) {
3219 return -1;
3220 }
3221 else if (x==Py_None) {
3222 Py_DECREF(x);
3223 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3224 return -1;
3225 }
3226 Py_DECREF(x);
3227 }
3228 /* fall through */
3229 case 3: /* ignore */
3230 *inpos = collendpos;
3231 break;
3232 case 4: /* xmlcharrefreplace */
3233 /* generate replacement (temporarily (mis)uses p) */
3234 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3235 char buffer[2+29+1+1];
3236 char *cp;
3237 sprintf(buffer, "&#%d;", (int)p[collpos]);
3238 for (cp = buffer; *cp; ++cp) {
3239 x = charmapencode_output(*cp, mapping, res, respos);
3240 if (x==NULL)
3241 return -1;
3242 else if (x==Py_None) {
3243 Py_DECREF(x);
3244 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3245 return -1;
3246 }
3247 Py_DECREF(x);
3248 }
3249 }
3250 *inpos = collendpos;
3251 break;
3252 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003253 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 encoding, reason, p, size, exceptionObject,
3255 collstartpos, collendpos, &newpos);
3256 if (repunicode == NULL)
3257 return -1;
3258 /* generate replacement */
3259 repsize = PyUnicode_GET_SIZE(repunicode);
3260 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3261 x = charmapencode_output(*uni2, mapping, res, respos);
3262 if (x==NULL) {
3263 Py_DECREF(repunicode);
3264 return -1;
3265 }
3266 else if (x==Py_None) {
3267 Py_DECREF(repunicode);
3268 Py_DECREF(x);
3269 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3270 return -1;
3271 }
3272 Py_DECREF(x);
3273 }
3274 *inpos = newpos;
3275 Py_DECREF(repunicode);
3276 }
3277 return 0;
3278}
3279
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003281 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 PyObject *mapping,
3283 const char *errors)
3284{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 /* output object */
3286 PyObject *res = NULL;
3287 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003288 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003290 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 PyObject *errorHandler = NULL;
3292 PyObject *exc = NULL;
3293 /* the following variable is used for caching string comparisons
3294 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3295 * 3=ignore, 4=xmlcharrefreplace */
3296 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297
3298 /* Default to Latin-1 */
3299 if (mapping == NULL)
3300 return PyUnicode_EncodeLatin1(p, size, errors);
3301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 /* allocate enough for a simple encoding without
3303 replacements, if we need more, we'll resize */
3304 res = PyString_FromStringAndSize(NULL, size);
3305 if (res == NULL)
3306 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003307 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 while (inpos<size) {
3311 /* try to encode it */
3312 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3313 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 if (x==Py_None) { /* unencodable character */
3316 if (charmap_encoding_error(p, size, &inpos, mapping,
3317 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003318 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003319 &res, &respos)) {
3320 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003321 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 else
3325 /* done with this character => adjust input position */
3326 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_DECREF(x);
3328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 /* Resize if we allocated to much */
3331 if (respos<PyString_GET_SIZE(res)) {
3332 if (_PyString_Resize(&res, respos))
3333 goto onError;
3334 }
3335 Py_XDECREF(exc);
3336 Py_XDECREF(errorHandler);
3337 return res;
3338
3339 onError:
3340 Py_XDECREF(res);
3341 Py_XDECREF(exc);
3342 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 return NULL;
3344}
3345
3346PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3347 PyObject *mapping)
3348{
3349 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3350 PyErr_BadArgument();
3351 return NULL;
3352 }
3353 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3354 PyUnicode_GET_SIZE(unicode),
3355 mapping,
3356 NULL);
3357}
3358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359/* create or adjust a UnicodeTranslateError */
3360static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003361 const Py_UNICODE *unicode, Py_ssize_t size,
3362 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 if (*exceptionObject == NULL) {
3366 *exceptionObject = PyUnicodeTranslateError_Create(
3367 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 }
3369 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3371 goto onError;
3372 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3373 goto onError;
3374 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3375 goto onError;
3376 return;
3377 onError:
3378 Py_DECREF(*exceptionObject);
3379 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
3381}
3382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383/* raises a UnicodeTranslateError */
3384static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003385 const Py_UNICODE *unicode, Py_ssize_t size,
3386 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 const char *reason)
3388{
3389 make_translate_exception(exceptionObject,
3390 unicode, size, startpos, endpos, reason);
3391 if (*exceptionObject != NULL)
3392 PyCodec_StrictErrors(*exceptionObject);
3393}
3394
3395/* error handling callback helper:
3396 build arguments, call the callback and check the arguments,
3397 put the result into newpos and return the replacement string, which
3398 has to be freed by the caller */
3399static PyObject *unicode_translate_call_errorhandler(const char *errors,
3400 PyObject **errorHandler,
3401 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3403 Py_ssize_t startpos, Py_ssize_t endpos,
3404 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003406 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407
Martin v. Löwis412fb672006-04-13 06:34:32 +00003408 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 PyObject *restuple;
3410 PyObject *resunicode;
3411
3412 if (*errorHandler == NULL) {
3413 *errorHandler = PyCodec_LookupError(errors);
3414 if (*errorHandler == NULL)
3415 return NULL;
3416 }
3417
3418 make_translate_exception(exceptionObject,
3419 unicode, size, startpos, endpos, reason);
3420 if (*exceptionObject == NULL)
3421 return NULL;
3422
3423 restuple = PyObject_CallFunctionObjArgs(
3424 *errorHandler, *exceptionObject, NULL);
3425 if (restuple == NULL)
3426 return NULL;
3427 if (!PyTuple_Check(restuple)) {
3428 PyErr_Format(PyExc_TypeError, &argparse[4]);
3429 Py_DECREF(restuple);
3430 return NULL;
3431 }
3432 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 Py_DECREF(restuple);
3435 return NULL;
3436 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 if (i_newpos<0)
3438 *newpos = size+i_newpos;
3439 else
3440 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003441 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003442 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003443 Py_DECREF(restuple);
3444 return NULL;
3445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 Py_INCREF(resunicode);
3447 Py_DECREF(restuple);
3448 return resunicode;
3449}
3450
3451/* Lookup the character ch in the mapping and put the result in result,
3452 which must be decrefed by the caller.
3453 Return 0 on success, -1 on error */
3454static
3455int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3456{
3457 PyObject *w = PyInt_FromLong((long)c);
3458 PyObject *x;
3459
3460 if (w == NULL)
3461 return -1;
3462 x = PyObject_GetItem(mapping, w);
3463 Py_DECREF(w);
3464 if (x == NULL) {
3465 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3466 /* No mapping found means: use 1:1 mapping. */
3467 PyErr_Clear();
3468 *result = NULL;
3469 return 0;
3470 } else
3471 return -1;
3472 }
3473 else if (x == Py_None) {
3474 *result = x;
3475 return 0;
3476 }
3477 else if (PyInt_Check(x)) {
3478 long value = PyInt_AS_LONG(x);
3479 long max = PyUnicode_GetMax();
3480 if (value < 0 || value > max) {
3481 PyErr_Format(PyExc_TypeError,
3482 "character mapping must be in range(0x%lx)", max+1);
3483 Py_DECREF(x);
3484 return -1;
3485 }
3486 *result = x;
3487 return 0;
3488 }
3489 else if (PyUnicode_Check(x)) {
3490 *result = x;
3491 return 0;
3492 }
3493 else {
3494 /* wrong return value */
3495 PyErr_SetString(PyExc_TypeError,
3496 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003497 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 return -1;
3499 }
3500}
3501/* ensure that *outobj is at least requiredsize characters long,
3502if not reallocate and adjust various state variables.
3503Return 0 on success, -1 on error */
3504static
Walter Dörwald4894c302003-10-24 14:25:28 +00003505int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003509 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003513 if (requiredsize < 2 * oldsize)
3514 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003515 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 return -1;
3517 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 }
3519 return 0;
3520}
3521/* lookup the character, put the result in the output string and adjust
3522 various state variables. Return a new reference to the object that
3523 was put in the output buffer in *result, or Py_None, if the mapping was
3524 undefined (in which case no character was written).
3525 The called must decref result.
3526 Return 0 on success, -1 on error. */
3527static
Walter Dörwald4894c302003-10-24 14:25:28 +00003528int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003530 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531{
Walter Dörwald4894c302003-10-24 14:25:28 +00003532 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 return -1;
3534 if (*res==NULL) {
3535 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003536 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538 else if (*res==Py_None)
3539 ;
3540 else if (PyInt_Check(*res)) {
3541 /* no overflow check, because we know that the space is enough */
3542 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3543 }
3544 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 if (repsize==1) {
3547 /* no overflow check, because we know that the space is enough */
3548 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3549 }
3550 else if (repsize!=0) {
3551 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003553 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003554 repsize - 1;
3555 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 return -1;
3557 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3558 *outp += repsize;
3559 }
3560 }
3561 else
3562 return -1;
3563 return 0;
3564}
3565
3566PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 PyObject *mapping,
3569 const char *errors)
3570{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 /* output object */
3572 PyObject *res = NULL;
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE *startp = p;
3575 const Py_UNICODE *endp = p + size;
3576 /* pointer into the output */
3577 Py_UNICODE *str;
3578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 char *reason = "character maps to <undefined>";
3581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3585 * 3=ignore, 4=xmlcharrefreplace */
3586 int known_errorHandler = -1;
3587
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 if (mapping == NULL) {
3589 PyErr_BadArgument();
3590 return NULL;
3591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592
3593 /* allocate enough for a simple 1:1 translation without
3594 replacements, if we need more, we'll resize */
3595 res = PyUnicode_FromUnicode(NULL, size);
3596 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003597 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 return res;
3600 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 while (p<endp) {
3603 /* try to encode it */
3604 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003605 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 goto onError;
3608 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003609 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 if (x!=Py_None) /* it worked => adjust input pointer */
3611 ++p;
3612 else { /* untranslatable character */
3613 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 Py_ssize_t repsize;
3615 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 Py_UNICODE *uni2;
3617 /* startpos for collecting untranslatable chars */
3618 const Py_UNICODE *collstart = p;
3619 const Py_UNICODE *collend = p+1;
3620 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 /* find all untranslatable characters */
3623 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003624 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 goto onError;
3626 Py_XDECREF(x);
3627 if (x!=Py_None)
3628 break;
3629 ++collend;
3630 }
3631 /* cache callback name lookup
3632 * (if not done yet, i.e. it's the first error) */
3633 if (known_errorHandler==-1) {
3634 if ((errors==NULL) || (!strcmp(errors, "strict")))
3635 known_errorHandler = 1;
3636 else if (!strcmp(errors, "replace"))
3637 known_errorHandler = 2;
3638 else if (!strcmp(errors, "ignore"))
3639 known_errorHandler = 3;
3640 else if (!strcmp(errors, "xmlcharrefreplace"))
3641 known_errorHandler = 4;
3642 else
3643 known_errorHandler = 0;
3644 }
3645 switch (known_errorHandler) {
3646 case 1: /* strict */
3647 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3648 goto onError;
3649 case 2: /* replace */
3650 /* No need to check for space, this is a 1:1 replacement */
3651 for (coll = collstart; coll<collend; ++coll)
3652 *str++ = '?';
3653 /* fall through */
3654 case 3: /* ignore */
3655 p = collend;
3656 break;
3657 case 4: /* xmlcharrefreplace */
3658 /* generate replacement (temporarily (mis)uses p) */
3659 for (p = collstart; p < collend; ++p) {
3660 char buffer[2+29+1+1];
3661 char *cp;
3662 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003663 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3665 goto onError;
3666 for (cp = buffer; *cp; ++cp)
3667 *str++ = *cp;
3668 }
3669 p = collend;
3670 break;
3671 default:
3672 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3673 reason, startp, size, &exc,
3674 collstart-startp, collend-startp, &newpos);
3675 if (repunicode == NULL)
3676 goto onError;
3677 /* generate replacement */
3678 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003679 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3681 Py_DECREF(repunicode);
3682 goto onError;
3683 }
3684 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3685 *str++ = *uni2;
3686 p = startp + newpos;
3687 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 }
3689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 /* Resize if we allocated to much */
3692 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003693 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003694 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003695 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 }
3697 Py_XDECREF(exc);
3698 Py_XDECREF(errorHandler);
3699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 onError:
3702 Py_XDECREF(res);
3703 Py_XDECREF(exc);
3704 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 return NULL;
3706}
3707
3708PyObject *PyUnicode_Translate(PyObject *str,
3709 PyObject *mapping,
3710 const char *errors)
3711{
3712 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 str = PyUnicode_FromObject(str);
3715 if (str == NULL)
3716 goto onError;
3717 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3718 PyUnicode_GET_SIZE(str),
3719 mapping,
3720 errors);
3721 Py_DECREF(str);
3722 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 onError:
3725 Py_XDECREF(str);
3726 return NULL;
3727}
Tim Petersced69f82003-09-16 20:30:58 +00003728
Guido van Rossum9e896b32000-04-05 20:11:21 +00003729/* --- Decimal Encoder ---------------------------------------------------- */
3730
3731int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003732 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003733 char *output,
3734 const char *errors)
3735{
3736 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 PyObject *errorHandler = NULL;
3738 PyObject *exc = NULL;
3739 const char *encoding = "decimal";
3740 const char *reason = "invalid decimal Unicode string";
3741 /* the following variable is used for caching string comparisons
3742 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3743 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003744
3745 if (output == NULL) {
3746 PyErr_BadArgument();
3747 return -1;
3748 }
3749
3750 p = s;
3751 end = s + length;
3752 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003754 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003756 Py_ssize_t repsize;
3757 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 Py_UNICODE *uni2;
3759 Py_UNICODE *collstart;
3760 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003761
Guido van Rossum9e896b32000-04-05 20:11:21 +00003762 if (Py_UNICODE_ISSPACE(ch)) {
3763 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003765 continue;
3766 }
3767 decimal = Py_UNICODE_TODECIMAL(ch);
3768 if (decimal >= 0) {
3769 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003771 continue;
3772 }
Guido van Rossumba477042000-04-06 18:18:10 +00003773 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003774 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003776 continue;
3777 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 /* All other characters are considered unencodable */
3779 collstart = p;
3780 collend = p+1;
3781 while (collend < end) {
3782 if ((0 < *collend && *collend < 256) ||
3783 !Py_UNICODE_ISSPACE(*collend) ||
3784 Py_UNICODE_TODECIMAL(*collend))
3785 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 /* cache callback name lookup
3788 * (if not done yet, i.e. it's the first error) */
3789 if (known_errorHandler==-1) {
3790 if ((errors==NULL) || (!strcmp(errors, "strict")))
3791 known_errorHandler = 1;
3792 else if (!strcmp(errors, "replace"))
3793 known_errorHandler = 2;
3794 else if (!strcmp(errors, "ignore"))
3795 known_errorHandler = 3;
3796 else if (!strcmp(errors, "xmlcharrefreplace"))
3797 known_errorHandler = 4;
3798 else
3799 known_errorHandler = 0;
3800 }
3801 switch (known_errorHandler) {
3802 case 1: /* strict */
3803 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3804 goto onError;
3805 case 2: /* replace */
3806 for (p = collstart; p < collend; ++p)
3807 *output++ = '?';
3808 /* fall through */
3809 case 3: /* ignore */
3810 p = collend;
3811 break;
3812 case 4: /* xmlcharrefreplace */
3813 /* generate replacement (temporarily (mis)uses p) */
3814 for (p = collstart; p < collend; ++p)
3815 output += sprintf(output, "&#%d;", (int)*p);
3816 p = collend;
3817 break;
3818 default:
3819 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3820 encoding, reason, s, length, &exc,
3821 collstart-s, collend-s, &newpos);
3822 if (repunicode == NULL)
3823 goto onError;
3824 /* generate replacement */
3825 repsize = PyUnicode_GET_SIZE(repunicode);
3826 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3827 Py_UNICODE ch = *uni2;
3828 if (Py_UNICODE_ISSPACE(ch))
3829 *output++ = ' ';
3830 else {
3831 decimal = Py_UNICODE_TODECIMAL(ch);
3832 if (decimal >= 0)
3833 *output++ = '0' + decimal;
3834 else if (0 < ch && ch < 256)
3835 *output++ = (char)ch;
3836 else {
3837 Py_DECREF(repunicode);
3838 raise_encode_exception(&exc, encoding,
3839 s, length, collstart-s, collend-s, reason);
3840 goto onError;
3841 }
3842 }
3843 }
3844 p = s + newpos;
3845 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003846 }
3847 }
3848 /* 0-terminate the output string */
3849 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 Py_XDECREF(exc);
3851 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003852 return 0;
3853
3854 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 Py_XDECREF(exc);
3856 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003857 return -1;
3858}
3859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860/* --- Helpers ------------------------------------------------------------ */
3861
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003862#define USE_FAST /* experimental fast search implementation */
3863
3864/* fast search/count implementation, based on a mix between boyer-
3865 moore and horspool, with a few more bells and whistles on the top.
3866 for some more background, see: http://effbot.org/stringlib */
3867
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types.  also, the count mode returns -1 if
   there cannot possibly be a match in the target string, and 0 if it
   has actually checked for matches, but didn't find any. */
3872
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003873#define FAST_COUNT 0
3874#define FAST_SEARCH 1
3875
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003876LOCAL(Py_ssize_t)
3877fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003878{
3879 long mask;
3880 int skip, count = 0;
3881 Py_ssize_t i, j, mlast, w;
3882
3883 w = n - m;
3884
3885 if (w < 0)
3886 return -1;
3887
3888 /* look for special cases */
3889 if (m <= 1) {
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003890 if (m <= 0)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003891 return -1;
3892 /* use special case for 1-character strings */
3893 if (mode == FAST_COUNT) {
3894 for (i = 0; i < n; i++)
3895 if (s[i] == p[0])
3896 count++;
3897 return count;
3898 } else {
3899 for (i = 0; i < n; i++)
3900 if (s[i] == p[0])
3901 return i;
3902 }
3903 return -1;
3904 }
3905
3906 mlast = m - 1;
3907
3908 /* create compressed boyer-moore delta 1 table */
3909 skip = mlast - 1;
3910 /* process pattern[:-1] */
3911 for (mask = i = 0; i < mlast; i++) {
3912 mask |= (1 << (p[i] & 0x1F));
3913 if (p[i] == p[mlast])
3914 skip = mlast - i - 1;
3915 }
3916 /* process pattern[-1] outside the loop */
3917 mask |= (1 << (p[mlast] & 0x1F));
3918
3919 for (i = 0; i <= w; i++) {
3920 /* note: using mlast in the skip path slows things down on x86 */
3921 if (s[i+m-1] == p[m-1]) {
3922 /* candidate match */
3923 for (j = 0; j < mlast; j++)
3924 if (s[i+j] != p[j])
3925 break;
3926 if (j == mlast) {
3927 /* got a match! */
3928 if (mode != FAST_COUNT)
3929 return i;
3930 count++;
3931 i = i + mlast;
3932 continue;
3933 }
3934 /* miss: check if next character is part of pattern */
3935 if (!(mask & (1 << (s[i+m] & 0x1F))))
3936 i = i + m;
3937 else {
3938 i = i + skip;
3939 continue;
3940 }
3941 } else {
3942 /* skip: check if next character is part of pattern */
3943 if (!(mask & (1 << (s[i+m] & 0x1F))))
3944 i = i + m;
3945 }
3946 }
3947
3948 if (mode != FAST_COUNT)
3949 return -1;
3950 return count;
3951}
3952
3953LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t start,
3955 Py_ssize_t end,
3956 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003958 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003960 if (start < 0)
3961 start += self->length;
3962 if (start < 0)
3963 start = 0;
3964 if (end > self->length)
3965 end = self->length;
3966 if (end < 0)
3967 end += self->length;
3968 if (end < 0)
3969 end = 0;
3970
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003971 if (substring->length == 0)
3972 return (end - start + 1);
3973
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003974#ifdef USE_FAST
3975 count = fastsearch(
3976 PyUnicode_AS_UNICODE(self) + start, end - start,
3977 substring->str, substring->length, FAST_COUNT
3978 );
3979 if (count < 0)
3980 count = 0; /* no match */
3981#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 end -= substring->length;
3983
3984 while (start <= end)
3985 if (Py_UNICODE_MATCH(self, start, substring)) {
3986 count++;
3987 start += substring->length;
3988 } else
3989 start++;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003990#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
3992 return count;
3993}
3994
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997 Py_ssize_t start,
3998 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004000 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004001
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 str = PyUnicode_FromObject(str);
4003 if (str == NULL)
4004 return -1;
4005 substr = PyUnicode_FromObject(substr);
4006 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00004007 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return -1;
4009 }
Tim Petersced69f82003-09-16 20:30:58 +00004010
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 result = count((PyUnicodeObject *)str,
4012 start, end,
4013 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 Py_DECREF(str);
4016 Py_DECREF(substr);
4017 return result;
4018}
4019
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004020static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t start,
4023 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 int direction)
4025{
4026 if (start < 0)
4027 start += self->length;
4028 if (start < 0)
4029 start = 0;
4030
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 if (end > self->length)
4032 end = self->length;
4033 if (end < 0)
4034 end += self->length;
4035 if (end < 0)
4036 end = 0;
4037
Guido van Rossum76afbd92002-08-20 17:29:29 +00004038 if (substring->length == 0)
4039 return (direction > 0) ? start : end;
4040
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004041#ifdef USE_FAST
4042 if (direction > 0) {
4043 Py_ssize_t pos = fastsearch(
4044 PyUnicode_AS_UNICODE(self) + start, end - start,
4045 substring->str, substring->length, FAST_SEARCH
4046 );
4047 if (pos < 0)
4048 return pos;
4049 return pos + start;
4050 }
4051#endif
4052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 end -= substring->length;
4054
4055 if (direction < 0) {
4056 for (; end >= start; end--)
4057 if (Py_UNICODE_MATCH(self, end, substring))
4058 return end;
4059 } else {
4060 for (; start <= end; start++)
4061 if (Py_UNICODE_MATCH(self, start, substring))
4062 return start;
4063 }
4064
4065 return -1;
4066}
4067
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t start,
4071 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 int direction)
4073{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004074 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004075
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 str = PyUnicode_FromObject(str);
4077 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004078 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 substr = PyUnicode_FromObject(substr);
4080 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004081 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004082 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
Tim Petersced69f82003-09-16 20:30:58 +00004084
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 result = findstring((PyUnicodeObject *)str,
4086 (PyUnicodeObject *)substr,
4087 start, end, direction);
4088 Py_DECREF(str);
4089 Py_DECREF(substr);
4090 return result;
4091}
4092
Tim Petersced69f82003-09-16 20:30:58 +00004093static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094int tailmatch(PyUnicodeObject *self,
4095 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004096 Py_ssize_t start,
4097 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 int direction)
4099{
4100 if (start < 0)
4101 start += self->length;
4102 if (start < 0)
4103 start = 0;
4104
4105 if (substring->length == 0)
4106 return 1;
4107
4108 if (end > self->length)
4109 end = self->length;
4110 if (end < 0)
4111 end += self->length;
4112 if (end < 0)
4113 end = 0;
4114
4115 end -= substring->length;
4116 if (end < start)
4117 return 0;
4118
4119 if (direction > 0) {
4120 if (Py_UNICODE_MATCH(self, end, substring))
4121 return 1;
4122 } else {
4123 if (Py_UNICODE_MATCH(self, start, substring))
4124 return 1;
4125 }
4126
4127 return 0;
4128}
4129
Martin v. Löwis18e16552006-02-15 17:27:45 +00004130Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004132 Py_ssize_t start,
4133 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 int direction)
4135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004136 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004137
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 str = PyUnicode_FromObject(str);
4139 if (str == NULL)
4140 return -1;
4141 substr = PyUnicode_FromObject(substr);
4142 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004143 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 return -1;
4145 }
Tim Petersced69f82003-09-16 20:30:58 +00004146
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 result = tailmatch((PyUnicodeObject *)str,
4148 (PyUnicodeObject *)substr,
4149 start, end, direction);
4150 Py_DECREF(str);
4151 Py_DECREF(substr);
4152 return result;
4153}
4154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155/* Apply fixfct filter to the Unicode object self and return a
4156 reference to the modified object */
4157
Tim Petersced69f82003-09-16 20:30:58 +00004158static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159PyObject *fixup(PyUnicodeObject *self,
4160 int (*fixfct)(PyUnicodeObject *s))
4161{
4162
4163 PyUnicodeObject *u;
4164
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004165 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 if (u == NULL)
4167 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004168
4169 Py_UNICODE_COPY(u->str, self->str, self->length);
4170
Tim Peters7a29bd52001-09-12 03:03:31 +00004171 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 /* fixfct should return TRUE if it modified the buffer. If
4173 FALSE, return a reference to the original buffer instead
4174 (to save space, not time) */
4175 Py_INCREF(self);
4176 Py_DECREF(u);
4177 return (PyObject*) self;
4178 }
4179 return (PyObject*) u;
4180}
4181
Tim Petersced69f82003-09-16 20:30:58 +00004182static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183int fixupper(PyUnicodeObject *self)
4184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004185 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 Py_UNICODE *s = self->str;
4187 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004188
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 while (len-- > 0) {
4190 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004191
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 ch = Py_UNICODE_TOUPPER(*s);
4193 if (ch != *s) {
4194 status = 1;
4195 *s = ch;
4196 }
4197 s++;
4198 }
4199
4200 return status;
4201}
4202
Tim Petersced69f82003-09-16 20:30:58 +00004203static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204int fixlower(PyUnicodeObject *self)
4205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004206 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 Py_UNICODE *s = self->str;
4208 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004209
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 while (len-- > 0) {
4211 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004212
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 ch = Py_UNICODE_TOLOWER(*s);
4214 if (ch != *s) {
4215 status = 1;
4216 *s = ch;
4217 }
4218 s++;
4219 }
4220
4221 return status;
4222}
4223
Tim Petersced69f82003-09-16 20:30:58 +00004224static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225int fixswapcase(PyUnicodeObject *self)
4226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004227 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 Py_UNICODE *s = self->str;
4229 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004230
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 while (len-- > 0) {
4232 if (Py_UNICODE_ISUPPER(*s)) {
4233 *s = Py_UNICODE_TOLOWER(*s);
4234 status = 1;
4235 } else if (Py_UNICODE_ISLOWER(*s)) {
4236 *s = Py_UNICODE_TOUPPER(*s);
4237 status = 1;
4238 }
4239 s++;
4240 }
4241
4242 return status;
4243}
4244
Tim Petersced69f82003-09-16 20:30:58 +00004245static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246int fixcapitalize(PyUnicodeObject *self)
4247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004248 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004249 Py_UNICODE *s = self->str;
4250 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004251
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004252 if (len == 0)
4253 return 0;
4254 if (Py_UNICODE_ISLOWER(*s)) {
4255 *s = Py_UNICODE_TOUPPER(*s);
4256 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004258 s++;
4259 while (--len > 0) {
4260 if (Py_UNICODE_ISUPPER(*s)) {
4261 *s = Py_UNICODE_TOLOWER(*s);
4262 status = 1;
4263 }
4264 s++;
4265 }
4266 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267}
4268
4269static
4270int fixtitle(PyUnicodeObject *self)
4271{
4272 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4273 register Py_UNICODE *e;
4274 int previous_is_cased;
4275
4276 /* Shortcut for single character strings */
4277 if (PyUnicode_GET_SIZE(self) == 1) {
4278 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4279 if (*p != ch) {
4280 *p = ch;
4281 return 1;
4282 }
4283 else
4284 return 0;
4285 }
Tim Petersced69f82003-09-16 20:30:58 +00004286
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 e = p + PyUnicode_GET_SIZE(self);
4288 previous_is_cased = 0;
4289 for (; p < e; p++) {
4290 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 if (previous_is_cased)
4293 *p = Py_UNICODE_TOLOWER(ch);
4294 else
4295 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004296
4297 if (Py_UNICODE_ISLOWER(ch) ||
4298 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 Py_UNICODE_ISTITLE(ch))
4300 previous_is_cased = 1;
4301 else
4302 previous_is_cased = 0;
4303 }
4304 return 1;
4305}
4306
Tim Peters8ce9f162004-08-27 01:49:32 +00004307PyObject *
4308PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Tim Peters8ce9f162004-08-27 01:49:32 +00004310 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004311 const Py_UNICODE blank = ' ';
4312 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004313 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004314 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004315 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4316 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004317 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4318 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004320 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004321 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322
Tim Peters05eba1f2004-08-27 21:32:02 +00004323 fseq = PySequence_Fast(seq, "");
4324 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004325 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004326 }
4327
Tim Peters91879ab2004-08-27 22:35:44 +00004328 /* Grrrr. A codec may be invoked to convert str objects to
4329 * Unicode, and so it's possible to call back into Python code
4330 * during PyUnicode_FromObject(), and so it's possible for a sick
4331 * codec to change the size of fseq (if seq is a list). Therefore
4332 * we have to keep refetching the size -- can't assume seqlen
4333 * is invariant.
4334 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004335 seqlen = PySequence_Fast_GET_SIZE(fseq);
4336 /* If empty sequence, return u"". */
4337 if (seqlen == 0) {
4338 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4339 goto Done;
4340 }
4341 /* If singleton sequence with an exact Unicode, return that. */
4342 if (seqlen == 1) {
4343 item = PySequence_Fast_GET_ITEM(fseq, 0);
4344 if (PyUnicode_CheckExact(item)) {
4345 Py_INCREF(item);
4346 res = (PyUnicodeObject *)item;
4347 goto Done;
4348 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004349 }
4350
Tim Peters05eba1f2004-08-27 21:32:02 +00004351 /* At least two items to join, or one that isn't exact Unicode. */
4352 if (seqlen > 1) {
4353 /* Set up sep and seplen -- they're needed. */
4354 if (separator == NULL) {
4355 sep = &blank;
4356 seplen = 1;
4357 }
4358 else {
4359 internal_separator = PyUnicode_FromObject(separator);
4360 if (internal_separator == NULL)
4361 goto onError;
4362 sep = PyUnicode_AS_UNICODE(internal_separator);
4363 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004364 /* In case PyUnicode_FromObject() mutated seq. */
4365 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004366 }
4367 }
4368
4369 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004370 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004371 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004372 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004373 res_p = PyUnicode_AS_UNICODE(res);
4374 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004375
Tim Peters05eba1f2004-08-27 21:32:02 +00004376 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004377 Py_ssize_t itemlen;
4378 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004379
4380 item = PySequence_Fast_GET_ITEM(fseq, i);
4381 /* Convert item to Unicode. */
4382 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4383 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004384 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004385 " %.80s found",
4386 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004387 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004388 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004389 item = PyUnicode_FromObject(item);
4390 if (item == NULL)
4391 goto onError;
4392 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004393
Tim Peters91879ab2004-08-27 22:35:44 +00004394 /* In case PyUnicode_FromObject() mutated seq. */
4395 seqlen = PySequence_Fast_GET_SIZE(fseq);
4396
Tim Peters8ce9f162004-08-27 01:49:32 +00004397 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004399 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004400 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004401 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004402 if (i < seqlen - 1) {
4403 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004404 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004405 goto Overflow;
4406 }
4407 if (new_res_used > res_alloc) {
4408 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004409 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004410 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004411 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004412 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004413 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004414 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004415 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004417 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004418 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004420
4421 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004422 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004423 res_p += itemlen;
4424 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004425 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004426 res_p += seplen;
4427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004429 res_used = new_res_used;
4430 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004431
Tim Peters05eba1f2004-08-27 21:32:02 +00004432 /* Shrink res to match the used area; this probably can't fail,
4433 * but it's cheap to check.
4434 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004435 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004436 goto onError;
4437
4438 Done:
4439 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004440 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 return (PyObject *)res;
4442
Tim Peters8ce9f162004-08-27 01:49:32 +00004443 Overflow:
4444 PyErr_SetString(PyExc_OverflowError,
4445 "join() is too long for a Python string");
4446 Py_DECREF(item);
4447 /* fall through */
4448
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004450 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004451 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004452 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 return NULL;
4454}
4455
Tim Petersced69f82003-09-16 20:30:58 +00004456static
4457PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004458 Py_ssize_t left,
4459 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 Py_UNICODE fill)
4461{
4462 PyUnicodeObject *u;
4463
4464 if (left < 0)
4465 left = 0;
4466 if (right < 0)
4467 right = 0;
4468
Tim Peters7a29bd52001-09-12 03:03:31 +00004469 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 Py_INCREF(self);
4471 return self;
4472 }
4473
4474 u = _PyUnicode_New(left + self->length + right);
4475 if (u) {
4476 if (left)
4477 Py_UNICODE_FILL(u->str, fill, left);
4478 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4479 if (right)
4480 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4481 }
4482
4483 return u;
4484}
4485
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `list` object and an `onError` label; control jumps to that label if
   the slice cannot be created or appended.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and is safe inside an unbraced if/else.  Terminate
   every use with a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4496
4497static
4498PyObject *split_whitespace(PyUnicodeObject *self,
4499 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 register Py_ssize_t i;
4503 register Py_ssize_t j;
4504 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 PyObject *str;
4506
4507 for (i = j = 0; i < len; ) {
4508 /* find a token */
4509 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4510 i++;
4511 j = i;
4512 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4513 i++;
4514 if (j < i) {
4515 if (maxcount-- <= 0)
4516 break;
4517 SPLIT_APPEND(self->str, j, i);
4518 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4519 i++;
4520 j = i;
4521 }
4522 }
4523 if (j < len) {
4524 SPLIT_APPEND(self->str, j, len);
4525 }
4526 return list;
4527
4528 onError:
4529 Py_DECREF(list);
4530 return NULL;
4531}
4532
4533PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004534 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004536 register Py_ssize_t i;
4537 register Py_ssize_t j;
4538 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 PyObject *list;
4540 PyObject *str;
4541 Py_UNICODE *data;
4542
4543 string = PyUnicode_FromObject(string);
4544 if (string == NULL)
4545 return NULL;
4546 data = PyUnicode_AS_UNICODE(string);
4547 len = PyUnicode_GET_SIZE(string);
4548
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 list = PyList_New(0);
4550 if (!list)
4551 goto onError;
4552
4553 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004555
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004557 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559
4560 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004561 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 if (i < len) {
4563 if (data[i] == '\r' && i + 1 < len &&
4564 data[i+1] == '\n')
4565 i += 2;
4566 else
4567 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004568 if (keepends)
4569 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
Guido van Rossum86662912000-04-11 15:38:46 +00004571 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 j = i;
4573 }
4574 if (j < len) {
4575 SPLIT_APPEND(data, j, len);
4576 }
4577
4578 Py_DECREF(string);
4579 return list;
4580
4581 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004582 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 Py_DECREF(string);
4584 return NULL;
4585}
4586
Tim Petersced69f82003-09-16 20:30:58 +00004587static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588PyObject *split_char(PyUnicodeObject *self,
4589 PyObject *list,
4590 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004591 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 register Py_ssize_t i;
4594 register Py_ssize_t j;
4595 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 PyObject *str;
4597
4598 for (i = j = 0; i < len; ) {
4599 if (self->str[i] == ch) {
4600 if (maxcount-- <= 0)
4601 break;
4602 SPLIT_APPEND(self->str, j, i);
4603 i = j = i + 1;
4604 } else
4605 i++;
4606 }
4607 if (j <= len) {
4608 SPLIT_APPEND(self->str, j, len);
4609 }
4610 return list;
4611
4612 onError:
4613 Py_DECREF(list);
4614 return NULL;
4615}
4616
Tim Petersced69f82003-09-16 20:30:58 +00004617static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618PyObject *split_substring(PyUnicodeObject *self,
4619 PyObject *list,
4620 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004623 register Py_ssize_t i;
4624 register Py_ssize_t j;
4625 Py_ssize_t len = self->length;
4626 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 PyObject *str;
4628
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004629 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 if (Py_UNICODE_MATCH(self, i, substring)) {
4631 if (maxcount-- <= 0)
4632 break;
4633 SPLIT_APPEND(self->str, j, i);
4634 i = j = i + sublen;
4635 } else
4636 i++;
4637 }
4638 if (j <= len) {
4639 SPLIT_APPEND(self->str, j, len);
4640 }
4641 return list;
4642
4643 onError:
4644 Py_DECREF(list);
4645 return NULL;
4646}
4647
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004648static
4649PyObject *rsplit_whitespace(PyUnicodeObject *self,
4650 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004652{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 register Py_ssize_t i;
4654 register Py_ssize_t j;
4655 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004656 PyObject *str;
4657
4658 for (i = j = len - 1; i >= 0; ) {
4659 /* find a token */
4660 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4661 i--;
4662 j = i;
4663 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4664 i--;
4665 if (j > i) {
4666 if (maxcount-- <= 0)
4667 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004668 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004669 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4670 i--;
4671 j = i;
4672 }
4673 }
4674 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004675 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004676 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004677 if (PyList_Reverse(list) < 0)
4678 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004679 return list;
4680
4681 onError:
4682 Py_DECREF(list);
4683 return NULL;
4684}
4685
4686static
4687PyObject *rsplit_char(PyUnicodeObject *self,
4688 PyObject *list,
4689 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004691{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 register Py_ssize_t i;
4693 register Py_ssize_t j;
4694 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004695 PyObject *str;
4696
4697 for (i = j = len - 1; i >= 0; ) {
4698 if (self->str[i] == ch) {
4699 if (maxcount-- <= 0)
4700 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004701 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004702 j = i = i - 1;
4703 } else
4704 i--;
4705 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004706 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004707 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004708 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004709 if (PyList_Reverse(list) < 0)
4710 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004711 return list;
4712
4713 onError:
4714 Py_DECREF(list);
4715 return NULL;
4716}
4717
4718static
4719PyObject *rsplit_substring(PyUnicodeObject *self,
4720 PyObject *list,
4721 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004723{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004724 register Py_ssize_t i;
4725 register Py_ssize_t j;
4726 Py_ssize_t len = self->length;
4727 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004728 PyObject *str;
4729
4730 for (i = len - sublen, j = len; i >= 0; ) {
4731 if (Py_UNICODE_MATCH(self, i, substring)) {
4732 if (maxcount-- <= 0)
4733 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004734 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004735 j = i;
4736 i -= sublen;
4737 } else
4738 i--;
4739 }
4740 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004741 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004742 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004743 if (PyList_Reverse(list) < 0)
4744 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004745 return list;
4746
4747 onError:
4748 Py_DECREF(list);
4749 return NULL;
4750}
4751
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752#undef SPLIT_APPEND
4753
4754static
4755PyObject *split(PyUnicodeObject *self,
4756 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004757 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
4759 PyObject *list;
4760
4761 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004762 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
4764 list = PyList_New(0);
4765 if (!list)
4766 return NULL;
4767
4768 if (substring == NULL)
4769 return split_whitespace(self,list,maxcount);
4770
4771 else if (substring->length == 1)
4772 return split_char(self,list,substring->str[0],maxcount);
4773
4774 else if (substring->length == 0) {
4775 Py_DECREF(list);
4776 PyErr_SetString(PyExc_ValueError, "empty separator");
4777 return NULL;
4778 }
4779 else
4780 return split_substring(self,list,substring,maxcount);
4781}
4782
Tim Petersced69f82003-09-16 20:30:58 +00004783static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004784PyObject *rsplit(PyUnicodeObject *self,
4785 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004787{
4788 PyObject *list;
4789
4790 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004791 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004792
4793 list = PyList_New(0);
4794 if (!list)
4795 return NULL;
4796
4797 if (substring == NULL)
4798 return rsplit_whitespace(self,list,maxcount);
4799
4800 else if (substring->length == 1)
4801 return rsplit_char(self,list,substring->str[0],maxcount);
4802
4803 else if (substring->length == 0) {
4804 Py_DECREF(list);
4805 PyErr_SetString(PyExc_ValueError, "empty separator");
4806 return NULL;
4807 }
4808 else
4809 return rsplit_substring(self,list,substring,maxcount);
4810}
4811
4812static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813PyObject *replace(PyUnicodeObject *self,
4814 PyUnicodeObject *str1,
4815 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817{
4818 PyUnicodeObject *u;
4819
4820 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004821 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
Fredrik Lundh347ee272006-05-24 16:35:18 +00004823 if (str1->length == str2->length) {
4824 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004825 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004826 if (str1->length == 1) {
4827 /* replace characters */
4828 Py_UNICODE u1, u2;
4829 if (!findchar(self->str, self->length, str1->str[0]))
4830 goto nothing;
4831 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4832 if (!u)
4833 return NULL;
4834 Py_UNICODE_COPY(u->str, self->str, self->length);
4835 u1 = str1->str[0];
4836 u2 = str2->str[0];
4837 for (i = 0; i < u->length; i++)
4838 if (u->str[i] == u1) {
4839 if (--maxcount < 0)
4840 break;
4841 u->str[i] = u2;
4842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004844 i = fastsearch(
4845 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004847 if (i < 0)
4848 goto nothing;
4849 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4850 if (!u)
4851 return NULL;
4852 Py_UNICODE_COPY(u->str, self->str, self->length);
4853 while (i <= self->length - str1->length)
4854 if (Py_UNICODE_MATCH(self, i, str1)) {
4855 if (--maxcount < 0)
4856 break;
4857 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4858 i += str1->length;
4859 } else
4860 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004863
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 Py_UNICODE *p;
4866
4867 /* replace strings */
4868 n = count(self, 0, self->length, str1);
4869 if (n > maxcount)
4870 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004871 if (n == 0)
4872 goto nothing;
4873 u = _PyUnicode_New(self->length + n * (str2->length - str1->length));
4874 if (!u)
4875 return NULL;
4876 i = 0;
4877 p = u->str;
4878 if (str1->length > 0) {
4879 while (i <= self->length - str1->length)
4880 if (Py_UNICODE_MATCH(self, i, str1)) {
4881 /* replace string segment */
4882 Py_UNICODE_COPY(p, str2->str, str2->length);
4883 p += str2->length;
4884 i += str1->length;
4885 if (--n <= 0) {
4886 /* copy remaining part */
4887 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4888 break;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004889 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004890 } else
4891 *p++ = self->str[i++];
4892 } else {
4893 while (n > 0) {
4894 Py_UNICODE_COPY(p, str2->str, str2->length);
4895 p += str2->length;
4896 if (--n <= 0)
4897 break;
4898 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004900 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
4902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004904
4905nothing:
4906 /* nothing to replace; return original string (when possible) */
4907 if (PyUnicode_CheckExact(self)) {
4908 Py_INCREF(self);
4909 return (PyObject *) self;
4910 }
4911 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
4913
4914/* --- Unicode Object Methods --------------------------------------------- */
4915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004916PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917"S.title() -> unicode\n\
4918\n\
4919Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004920characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921
4922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004923unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 return fixup(self, fixtitle);
4926}
4927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004928PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929"S.capitalize() -> unicode\n\
4930\n\
4931Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004932have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933
4934static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004935unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 return fixup(self, fixcapitalize);
4938}
4939
#if 0
/* Disabled: the whole definition is compiled out by the surrounding
   #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run every word through fixup() with
   fixcapitalize, and join the words again via PyUnicode_Join() with a
   NULL separator.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4977
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004978/* Argument converter. Coerces to a single unicode character */
4979
4980static int
4981convert_uc(PyObject *obj, void *addr)
4982{
4983 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4984 PyObject *uniobj;
4985 Py_UNICODE *unistr;
4986
4987 uniobj = PyUnicode_FromObject(obj);
4988 if (uniobj == NULL) {
4989 PyErr_SetString(PyExc_TypeError,
4990 "The fill character cannot be converted to Unicode");
4991 return 0;
4992 }
4993 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4994 PyErr_SetString(PyExc_TypeError,
4995 "The fill character must be exactly one character long");
4996 Py_DECREF(uniobj);
4997 return 0;
4998 }
4999 unistr = PyUnicode_AS_UNICODE(uniobj);
5000 *fillcharloc = unistr[0];
5001 Py_DECREF(uniobj);
5002 return 1;
5003}
5004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005005PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005006"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005008Return S centered in a Unicode string of length width. Padding is\n\
5009done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
5011static PyObject *
5012unicode_center(PyUnicodeObject *self, PyObject *args)
5013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 Py_ssize_t marg, left;
5015 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005016 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017
Thomas Woutersde017742006-02-16 19:34:37 +00005018 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 return NULL;
5020
Tim Peters7a29bd52001-09-12 03:03:31 +00005021 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 Py_INCREF(self);
5023 return (PyObject*) self;
5024 }
5025
5026 marg = width - self->length;
5027 left = marg / 2 + (marg & width & 1);
5028
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005029 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030}
5031
Marc-André Lemburge5034372000-08-08 08:04:29 +00005032#if 0
5033
5034/* This code should go into some future Unicode collation support
5035 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005036 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005037
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005038/* speedy UTF-16 code point order comparison */
5039/* gleaned from: */
5040/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5041
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005042static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005043{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005044 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005045 0, 0, 0, 0, 0, 0, 0, 0,
5046 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005047 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005048};
5049
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050static int
5051unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5052{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005053 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005054
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 Py_UNICODE *s1 = str1->str;
5056 Py_UNICODE *s2 = str2->str;
5057
5058 len1 = str1->length;
5059 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005060
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005062 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005063
5064 c1 = *s1++;
5065 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005066
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005067 if (c1 > (1<<11) * 26)
5068 c1 += utf16Fixup[c1>>11];
5069 if (c2 > (1<<11) * 26)
5070 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005071 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005072
5073 if (c1 != c2)
5074 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005075
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005076 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 }
5078
5079 return (len1 < len2) ? -1 : (len1 != len2);
5080}
5081
Marc-André Lemburge5034372000-08-08 08:04:29 +00005082#else
5083
5084static int
5085unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005087 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005088
5089 Py_UNICODE *s1 = str1->str;
5090 Py_UNICODE *s2 = str2->str;
5091
5092 len1 = str1->length;
5093 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005094
Marc-André Lemburge5034372000-08-08 08:04:29 +00005095 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005096 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005097
Fredrik Lundh45714e92001-06-26 16:39:36 +00005098 c1 = *s1++;
5099 c2 = *s2++;
5100
5101 if (c1 != c2)
5102 return (c1 < c2) ? -1 : 1;
5103
Marc-André Lemburge5034372000-08-08 08:04:29 +00005104 len1--; len2--;
5105 }
5106
5107 return (len1 < len2) ? -1 : (len1 != len2);
5108}
5109
5110#endif
5111
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112int PyUnicode_Compare(PyObject *left,
5113 PyObject *right)
5114{
5115 PyUnicodeObject *u = NULL, *v = NULL;
5116 int result;
5117
5118 /* Coerce the two arguments */
5119 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5120 if (u == NULL)
5121 goto onError;
5122 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5123 if (v == NULL)
5124 goto onError;
5125
Thomas Wouters7e474022000-07-16 12:04:32 +00005126 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 if (v == u) {
5128 Py_DECREF(u);
5129 Py_DECREF(v);
5130 return 0;
5131 }
5132
5133 result = unicode_compare(u, v);
5134
5135 Py_DECREF(u);
5136 Py_DECREF(v);
5137 return result;
5138
5139onError:
5140 Py_XDECREF(u);
5141 Py_XDECREF(v);
5142 return -1;
5143}
5144
Guido van Rossum403d68b2000-03-13 15:55:09 +00005145int PyUnicode_Contains(PyObject *container,
5146 PyObject *element)
5147{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005148 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149 int result;
5150 Py_ssize_t size;
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005151#ifdef USE_FAST
5152 Py_ssize_t pos;
5153#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005154
5155 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005156 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5157 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005158 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005159 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005160 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005161 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005162
5163 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5164 if (!u) {
5165 Py_DECREF(v);
5166 return -1;
5167 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005168
Barry Warsaw817918c2002-08-06 16:58:21 +00005169 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005170 if (!size) {
5171 result = 1;
5172 goto done;
5173 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005174
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005175#ifdef USE_FAST
5176 pos = fastsearch(
5177 PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
5178 PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
5179 );
5180 result = (pos != -1);
5181#else
Guido van Rossum403d68b2000-03-13 15:55:09 +00005182 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005183
Barry Warsaw817918c2002-08-06 16:58:21 +00005184 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005185 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5186 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5187 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5188 for (; ptr < end; ptr++) {
5189 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005190 result = 1;
5191 break;
5192 }
5193 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005194 } else {
Fredrik Lundh240bf2a2006-05-24 10:20:36 +00005195 Py_ssize_t start = 0;
5196 Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005197 for (; start <= end; start++)
5198 if (Py_UNICODE_MATCH(u, start, v)) {
5199 result = 1;
5200 break;
5201 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005202 }
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005203#endif
Guido van Rossum403d68b2000-03-13 15:55:09 +00005204
Fredrik Lundh833bf942006-05-23 10:12:21 +00005205done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005206 Py_DECREF(u);
5207 Py_DECREF(v);
5208 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005209}
5210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211/* Concat to string or Unicode object giving a new Unicode object. */
5212
5213PyObject *PyUnicode_Concat(PyObject *left,
5214 PyObject *right)
5215{
5216 PyUnicodeObject *u = NULL, *v = NULL, *w;
5217
5218 /* Coerce the two arguments */
5219 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5220 if (u == NULL)
5221 goto onError;
5222 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5223 if (v == NULL)
5224 goto onError;
5225
5226 /* Shortcuts */
5227 if (v == unicode_empty) {
5228 Py_DECREF(v);
5229 return (PyObject *)u;
5230 }
5231 if (u == unicode_empty) {
5232 Py_DECREF(u);
5233 return (PyObject *)v;
5234 }
5235
5236 /* Concat the two Unicode strings */
5237 w = _PyUnicode_New(u->length + v->length);
5238 if (w == NULL)
5239 goto onError;
5240 Py_UNICODE_COPY(w->str, u->str, u->length);
5241 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5242
5243 Py_DECREF(u);
5244 Py_DECREF(v);
5245 return (PyObject *)w;
5246
5247onError:
5248 Py_XDECREF(u);
5249 Py_XDECREF(v);
5250 return NULL;
5251}
5252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005253PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254"S.count(sub[, start[, end]]) -> int\n\
5255\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005256Return the number of non-overlapping occurrences of substring sub in\n\
5257Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005258interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259
5260static PyObject *
5261unicode_count(PyUnicodeObject *self, PyObject *args)
5262{
5263 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005265 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 PyObject *result;
5267
Guido van Rossumb8872e62000-05-09 14:14:27 +00005268 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5269 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 return NULL;
5271
5272 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5273 (PyObject *)substring);
5274 if (substring == NULL)
5275 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 if (start < 0)
5278 start += self->length;
5279 if (start < 0)
5280 start = 0;
5281 if (end > self->length)
5282 end = self->length;
5283 if (end < 0)
5284 end += self->length;
5285 if (end < 0)
5286 end = 0;
5287
5288 result = PyInt_FromLong((long) count(self, start, end, substring));
5289
5290 Py_DECREF(substring);
5291 return result;
5292}
5293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005294PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005295"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005297Encodes S using the codec registered for encoding. encoding defaults\n\
5298to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005299handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5301'xmlcharrefreplace' as well as any other name registered with\n\
5302codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303
5304static PyObject *
5305unicode_encode(PyUnicodeObject *self, PyObject *args)
5306{
5307 char *encoding = NULL;
5308 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005309 PyObject *v;
5310
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5312 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005313 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005314 if (v == NULL)
5315 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005316 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5317 PyErr_Format(PyExc_TypeError,
5318 "encoder did not return a string/unicode object "
5319 "(type=%.400s)",
5320 v->ob_type->tp_name);
5321 Py_DECREF(v);
5322 return NULL;
5323 }
5324 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005325
5326 onError:
5327 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005328}
5329
5330PyDoc_STRVAR(decode__doc__,
5331"S.decode([encoding[,errors]]) -> string or unicode\n\
5332\n\
5333Decodes S using the codec registered for encoding. encoding defaults\n\
5334to the default encoding. errors may be given to set a different error\n\
5335handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5336a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5337as well as any other name registerd with codecs.register_error that is\n\
5338able to handle UnicodeDecodeErrors.");
5339
5340static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005341unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005342{
5343 char *encoding = NULL;
5344 char *errors = NULL;
5345 PyObject *v;
5346
5347 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5348 return NULL;
5349 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005350 if (v == NULL)
5351 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005352 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5353 PyErr_Format(PyExc_TypeError,
5354 "decoder did not return a string/unicode object "
5355 "(type=%.400s)",
5356 v->ob_type->tp_name);
5357 Py_DECREF(v);
5358 return NULL;
5359 }
5360 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005361
5362 onError:
5363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005366PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367"S.expandtabs([tabsize]) -> unicode\n\
5368\n\
5369Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005370If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
5372static PyObject*
5373unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5374{
5375 Py_UNICODE *e;
5376 Py_UNICODE *p;
5377 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005378 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 PyUnicodeObject *u;
5380 int tabsize = 8;
5381
5382 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5383 return NULL;
5384
Thomas Wouters7e474022000-07-16 12:04:32 +00005385 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 i = j = 0;
5387 e = self->str + self->length;
5388 for (p = self->str; p < e; p++)
5389 if (*p == '\t') {
5390 if (tabsize > 0)
5391 j += tabsize - (j % tabsize);
5392 }
5393 else {
5394 j++;
5395 if (*p == '\n' || *p == '\r') {
5396 i += j;
5397 j = 0;
5398 }
5399 }
5400
5401 /* Second pass: create output string and fill it */
5402 u = _PyUnicode_New(i + j);
5403 if (!u)
5404 return NULL;
5405
5406 j = 0;
5407 q = u->str;
5408
5409 for (p = self->str; p < e; p++)
5410 if (*p == '\t') {
5411 if (tabsize > 0) {
5412 i = tabsize - (j % tabsize);
5413 j += i;
5414 while (i--)
5415 *q++ = ' ';
5416 }
5417 }
5418 else {
5419 j++;
5420 *q++ = *p;
5421 if (*p == '\n' || *p == '\r')
5422 j = 0;
5423 }
5424
5425 return (PyObject*) u;
5426}
5427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005428PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429"S.find(sub [,start [,end]]) -> int\n\
5430\n\
5431Return the lowest index in S where substring sub is found,\n\
5432such that sub is contained within s[start,end]. Optional\n\
5433arguments start and end are interpreted as in slice notation.\n\
5434\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005435Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
5437static PyObject *
5438unicode_find(PyUnicodeObject *self, PyObject *args)
5439{
5440 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005441 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005442 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 PyObject *result;
5444
Guido van Rossumb8872e62000-05-09 14:14:27 +00005445 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5446 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 return NULL;
5448 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5449 (PyObject *)substring);
5450 if (substring == NULL)
5451 return NULL;
5452
Martin v. Löwis18e16552006-02-15 17:27:45 +00005453 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454
5455 Py_DECREF(substring);
5456 return result;
5457}
5458
5459static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
5462 if (index < 0 || index >= self->length) {
5463 PyErr_SetString(PyExc_IndexError, "string index out of range");
5464 return NULL;
5465 }
5466
5467 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5468}
5469
5470static long
5471unicode_hash(PyUnicodeObject *self)
5472{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005473 /* Since Unicode objects compare equal to their ASCII string
5474 counterparts, they should use the individual character values
5475 as basis for their hash value. This is needed to assure that
5476 strings and Unicode objects behave in the same way as
5477 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005480 register Py_UNICODE *p;
5481 register long x;
5482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 if (self->hash != -1)
5484 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005485 len = PyUnicode_GET_SIZE(self);
5486 p = PyUnicode_AS_UNICODE(self);
5487 x = *p << 7;
5488 while (--len >= 0)
5489 x = (1000003*x) ^ *p++;
5490 x ^= PyUnicode_GET_SIZE(self);
5491 if (x == -1)
5492 x = -2;
5493 self->hash = x;
5494 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495}
5496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005497PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498"S.index(sub [,start [,end]]) -> int\n\
5499\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005500Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501
5502static PyObject *
5503unicode_index(PyUnicodeObject *self, PyObject *args)
5504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005507 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005508 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
Guido van Rossumb8872e62000-05-09 14:14:27 +00005510 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5511 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005513
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5515 (PyObject *)substring);
5516 if (substring == NULL)
5517 return NULL;
5518
5519 result = findstring(self, substring, start, end, 1);
5520
5521 Py_DECREF(substring);
5522 if (result < 0) {
5523 PyErr_SetString(PyExc_ValueError, "substring not found");
5524 return NULL;
5525 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005526 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527}
5528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005529PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005532Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005533at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534
5535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005536unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
5538 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5539 register const Py_UNICODE *e;
5540 int cased;
5541
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 /* Shortcut for single character strings */
5543 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005544 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005546 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005547 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005548 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 e = p + PyUnicode_GET_SIZE(self);
5551 cased = 0;
5552 for (; p < e; p++) {
5553 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005554
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005556 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 else if (!cased && Py_UNICODE_ISLOWER(ch))
5558 cased = 1;
5559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005560 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561}
5562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005563PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005564"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005566Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005567at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
5569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005570unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
5572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5573 register const Py_UNICODE *e;
5574 int cased;
5575
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 /* Shortcut for single character strings */
5577 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005578 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005581 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 e = p + PyUnicode_GET_SIZE(self);
5585 cased = 0;
5586 for (; p < e; p++) {
5587 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005588
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005590 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 else if (!cased && Py_UNICODE_ISUPPER(ch))
5592 cased = 1;
5593 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005594 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595}
5596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005597PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005598"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005600Return True if S is a titlecased string and there is at least one\n\
5601character in S, i.e. upper- and titlecase characters may only\n\
5602follow uncased characters and lowercase characters only cased ones.\n\
5603Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
5605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005606unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
5608 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609 register const Py_UNICODE *e;
5610 int cased, previous_is_cased;
5611
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 /* Shortcut for single character strings */
5613 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005614 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5615 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005617 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005618 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005620
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 e = p + PyUnicode_GET_SIZE(self);
5622 cased = 0;
5623 previous_is_cased = 0;
5624 for (; p < e; p++) {
5625 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5628 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 previous_is_cased = 1;
5631 cased = 1;
5632 }
5633 else if (Py_UNICODE_ISLOWER(ch)) {
5634 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005635 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 previous_is_cased = 1;
5637 cased = 1;
5638 }
5639 else
5640 previous_is_cased = 0;
5641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005642 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643}
5644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005646"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005648Return True if all characters in S are whitespace\n\
5649and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
5654 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5655 register const Py_UNICODE *e;
5656
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 /* Shortcut for single character strings */
5658 if (PyUnicode_GET_SIZE(self) == 1 &&
5659 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005660 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005662 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005663 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005664 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005665
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 e = p + PyUnicode_GET_SIZE(self);
5667 for (; p < e; p++) {
5668 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005669 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005671 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672}
5673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005674PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005675"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005676\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005677Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005678and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005679
5680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005681unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005682{
5683 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5684 register const Py_UNICODE *e;
5685
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005686 /* Shortcut for single character strings */
5687 if (PyUnicode_GET_SIZE(self) == 1 &&
5688 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005689 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005690
5691 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005692 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005693 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005694
5695 e = p + PyUnicode_GET_SIZE(self);
5696 for (; p < e; p++) {
5697 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005698 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005699 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005700 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005701}
5702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005703PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005704"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005705\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005706Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005707and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005708
5709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005710unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005711{
5712 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5713 register const Py_UNICODE *e;
5714
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005715 /* Shortcut for single character strings */
5716 if (PyUnicode_GET_SIZE(self) == 1 &&
5717 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005718 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005719
5720 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005721 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005722 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005723
5724 e = p + PyUnicode_GET_SIZE(self);
5725 for (; p < e; p++) {
5726 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005727 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005728 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005729 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005730}
5731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005732PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005733"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005735Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005736False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
5738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005739unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740{
5741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5742 register const Py_UNICODE *e;
5743
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 /* Shortcut for single character strings */
5745 if (PyUnicode_GET_SIZE(self) == 1 &&
5746 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005747 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005749 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005750 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 e = p + PyUnicode_GET_SIZE(self);
5754 for (; p < e; p++) {
5755 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005758 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759}
5760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005761PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005762"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005764Return True if all characters in S are digits\n\
5765and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
5767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005768unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769{
5770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5771 register const Py_UNICODE *e;
5772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 /* Shortcut for single character strings */
5774 if (PyUnicode_GET_SIZE(self) == 1 &&
5775 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005776 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005778 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005779 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005780 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005781
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 e = p + PyUnicode_GET_SIZE(self);
5783 for (; p < e; p++) {
5784 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788}
5789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005790PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005791"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005793Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005794False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
5796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005797unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
5799 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5800 register const Py_UNICODE *e;
5801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 /* Shortcut for single character strings */
5803 if (PyUnicode_GET_SIZE(self) == 1 &&
5804 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005805 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005807 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005808 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005809 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 e = p + PyUnicode_GET_SIZE(self);
5812 for (; p < e; p++) {
5813 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005814 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005816 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817}
5818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820"S.join(sequence) -> unicode\n\
5821\n\
5822Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005823sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824
5825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005826unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005828 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829}
5830
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832unicode_length(PyUnicodeObject *self)
5833{
5834 return self->length;
5835}
5836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005837PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005838"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839\n\
5840Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005841done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842
5843static PyObject *
5844unicode_ljust(PyUnicodeObject *self, PyObject *args)
5845{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005846 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005847 Py_UNICODE fillchar = ' ';
5848
Martin v. Löwis412fb672006-04-13 06:34:32 +00005849 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 return NULL;
5851
Tim Peters7a29bd52001-09-12 03:03:31 +00005852 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 Py_INCREF(self);
5854 return (PyObject*) self;
5855 }
5856
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005857 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858}
5859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005860PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861"S.lower() -> unicode\n\
5862\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005863Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
5865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005866unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 return fixup(self, fixlower);
5869}
5870
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
5879
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005880/* externally visible for str.strip(unicode) */
5881PyObject *
5882_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5883{
5884 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005885 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005886 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5888 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005889
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005890 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5891
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005892 i = 0;
5893 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005894 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5895 i++;
5896 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005897 }
5898
5899 j = len;
5900 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005901 do {
5902 j--;
5903 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5904 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005905 }
5906
5907 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005908 Py_INCREF(self);
5909 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005910 }
5911 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005912 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005913}
5914
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
5916static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005917do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005919 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005921
5922 i = 0;
5923 if (striptype != RIGHTSTRIP) {
5924 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5925 i++;
5926 }
5927 }
5928
5929 j = len;
5930 if (striptype != LEFTSTRIP) {
5931 do {
5932 j--;
5933 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5934 j++;
5935 }
5936
5937 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5938 Py_INCREF(self);
5939 return (PyObject*)self;
5940 }
5941 else
5942 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943}
5944
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005945
5946static PyObject *
5947do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5948{
5949 PyObject *sep = NULL;
5950
5951 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5952 return NULL;
5953
5954 if (sep != NULL && sep != Py_None) {
5955 if (PyUnicode_Check(sep))
5956 return _PyUnicode_XStrip(self, striptype, sep);
5957 else if (PyString_Check(sep)) {
5958 PyObject *res;
5959 sep = PyUnicode_FromObject(sep);
5960 if (sep==NULL)
5961 return NULL;
5962 res = _PyUnicode_XStrip(self, striptype, sep);
5963 Py_DECREF(sep);
5964 return res;
5965 }
5966 else {
5967 PyErr_Format(PyExc_TypeError,
5968 "%s arg must be None, unicode or str",
5969 STRIPNAME(striptype));
5970 return NULL;
5971 }
5972 }
5973
5974 return do_strip(self, striptype);
5975}
5976
5977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005978PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005979"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005980\n\
5981Return a copy of the string S with leading and trailing\n\
5982whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005983If chars is given and not None, remove characters in chars instead.\n\
5984If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005985
5986static PyObject *
5987unicode_strip(PyUnicodeObject *self, PyObject *args)
5988{
5989 if (PyTuple_GET_SIZE(args) == 0)
5990 return do_strip(self, BOTHSTRIP); /* Common case */
5991 else
5992 return do_argstrip(self, BOTHSTRIP, args);
5993}
5994
5995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005996PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005997"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005998\n\
5999Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006000If chars is given and not None, remove characters in chars instead.\n\
6001If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006002
6003static PyObject *
6004unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6005{
6006 if (PyTuple_GET_SIZE(args) == 0)
6007 return do_strip(self, LEFTSTRIP); /* Common case */
6008 else
6009 return do_argstrip(self, LEFTSTRIP, args);
6010}
6011
6012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006013PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006014"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006015\n\
6016Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006017If chars is given and not None, remove characters in chars instead.\n\
6018If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006019
6020static PyObject *
6021unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6022{
6023 if (PyTuple_GET_SIZE(args) == 0)
6024 return do_strip(self, RIGHTSTRIP); /* Common case */
6025 else
6026 return do_argstrip(self, RIGHTSTRIP, args);
6027}
6028
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006031unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032{
6033 PyUnicodeObject *u;
6034 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006036 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
6038 if (len < 0)
6039 len = 0;
6040
Tim Peters7a29bd52001-09-12 03:03:31 +00006041 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 /* no repeat, return original string */
6043 Py_INCREF(str);
6044 return (PyObject*) str;
6045 }
Tim Peters8f422462000-09-09 06:13:41 +00006046
6047 /* ensure # of chars needed doesn't overflow int and # of bytes
6048 * needed doesn't overflow size_t
6049 */
6050 nchars = len * str->length;
6051 if (len && nchars / len != str->length) {
6052 PyErr_SetString(PyExc_OverflowError,
6053 "repeated string is too long");
6054 return NULL;
6055 }
6056 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6057 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6058 PyErr_SetString(PyExc_OverflowError,
6059 "repeated string is too long");
6060 return NULL;
6061 }
6062 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 if (!u)
6064 return NULL;
6065
6066 p = u->str;
6067
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006068 if (str->length == 1 && len > 0) {
6069 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006070 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006071 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006072 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006073 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006074 done = str->length;
6075 }
6076 while (done < nchars) {
6077 int n = (done <= nchars-done) ? done : nchars-done;
6078 Py_UNICODE_COPY(p+done, p, n);
6079 done += n;
6080 }
6081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
6083 return (PyObject*) u;
6084}
6085
6086PyObject *PyUnicode_Replace(PyObject *obj,
6087 PyObject *subobj,
6088 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006089 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
6091 PyObject *self;
6092 PyObject *str1;
6093 PyObject *str2;
6094 PyObject *result;
6095
6096 self = PyUnicode_FromObject(obj);
6097 if (self == NULL)
6098 return NULL;
6099 str1 = PyUnicode_FromObject(subobj);
6100 if (str1 == NULL) {
6101 Py_DECREF(self);
6102 return NULL;
6103 }
6104 str2 = PyUnicode_FromObject(replobj);
6105 if (str2 == NULL) {
6106 Py_DECREF(self);
6107 Py_DECREF(str1);
6108 return NULL;
6109 }
Tim Petersced69f82003-09-16 20:30:58 +00006110 result = replace((PyUnicodeObject *)self,
6111 (PyUnicodeObject *)str1,
6112 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 maxcount);
6114 Py_DECREF(self);
6115 Py_DECREF(str1);
6116 Py_DECREF(str2);
6117 return result;
6118}
6119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121"S.replace (old, new[, maxsplit]) -> unicode\n\
6122\n\
6123Return a copy of S with all occurrences of substring\n\
6124old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
6127static PyObject*
6128unicode_replace(PyUnicodeObject *self, PyObject *args)
6129{
6130 PyUnicodeObject *str1;
6131 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 PyObject *result;
6134
Martin v. Löwis18e16552006-02-15 17:27:45 +00006135 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return NULL;
6137 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6138 if (str1 == NULL)
6139 return NULL;
6140 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006141 if (str2 == NULL) {
6142 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
6146 result = replace(self, str1, str2, maxcount);
6147
6148 Py_DECREF(str1);
6149 Py_DECREF(str2);
6150 return result;
6151}
6152
6153static
6154PyObject *unicode_repr(PyObject *unicode)
6155{
6156 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6157 PyUnicode_GET_SIZE(unicode),
6158 1);
6159}
6160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006161PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162"S.rfind(sub [,start [,end]]) -> int\n\
6163\n\
6164Return the highest index in S where substring sub is found,\n\
6165such that sub is contained within s[start,end]. Optional\n\
6166arguments start and end are interpreted as in slice notation.\n\
6167\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006168Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
6170static PyObject *
6171unicode_rfind(PyUnicodeObject *self, PyObject *args)
6172{
6173 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006174 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006175 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 PyObject *result;
6177
Guido van Rossumb8872e62000-05-09 14:14:27 +00006178 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6179 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 return NULL;
6181 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6182 (PyObject *)substring);
6183 if (substring == NULL)
6184 return NULL;
6185
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188 Py_DECREF(substring);
6189 return result;
6190}
6191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006192PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193"S.rindex(sub [,start [,end]]) -> int\n\
6194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006195Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
6197static PyObject *
6198unicode_rindex(PyUnicodeObject *self, PyObject *args)
6199{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006200 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006203 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204
Guido van Rossumb8872e62000-05-09 14:14:27 +00006205 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 return NULL;
6208 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6209 (PyObject *)substring);
6210 if (substring == NULL)
6211 return NULL;
6212
6213 result = findstring(self, substring, start, end, -1);
6214
6215 Py_DECREF(substring);
6216 if (result < 0) {
6217 PyErr_SetString(PyExc_ValueError, "substring not found");
6218 return NULL;
6219 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006220 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006224"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225\n\
6226Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006227done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
6229static PyObject *
6230unicode_rjust(PyUnicodeObject *self, PyObject *args)
6231{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006232 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006233 Py_UNICODE fillchar = ' ';
6234
Martin v. Löwis412fb672006-04-13 06:34:32 +00006235 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 return NULL;
6237
Tim Peters7a29bd52001-09-12 03:03:31 +00006238 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 Py_INCREF(self);
6240 return (PyObject*) self;
6241 }
6242
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006243 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244}
6245
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006247unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248{
6249 /* standard clamping */
6250 if (start < 0)
6251 start = 0;
6252 if (end < 0)
6253 end = 0;
6254 if (end > self->length)
6255 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006256 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 /* full slice, return original string */
6258 Py_INCREF(self);
6259 return (PyObject*) self;
6260 }
6261 if (start > end)
6262 start = end;
6263 /* copy slice */
6264 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6265 end - start);
6266}
6267
6268PyObject *PyUnicode_Split(PyObject *s,
6269 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006270 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
6272 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006273
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 s = PyUnicode_FromObject(s);
6275 if (s == NULL)
6276 return NULL;
6277 if (sep != NULL) {
6278 sep = PyUnicode_FromObject(sep);
6279 if (sep == NULL) {
6280 Py_DECREF(s);
6281 return NULL;
6282 }
6283 }
6284
6285 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6286
6287 Py_DECREF(s);
6288 Py_XDECREF(sep);
6289 return result;
6290}
6291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006292PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293"S.split([sep [,maxsplit]]) -> list of strings\n\
6294\n\
6295Return a list of the words in S, using sep as the\n\
6296delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006297splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006298any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
6300static PyObject*
6301unicode_split(PyUnicodeObject *self, PyObject *args)
6302{
6303 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305
Martin v. Löwis18e16552006-02-15 17:27:45 +00006306 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 return NULL;
6308
6309 if (substring == Py_None)
6310 return split(self, NULL, maxcount);
6311 else if (PyUnicode_Check(substring))
6312 return split(self, (PyUnicodeObject *)substring, maxcount);
6313 else
6314 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6315}
6316
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006317PyObject *PyUnicode_RSplit(PyObject *s,
6318 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006319 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006320{
6321 PyObject *result;
6322
6323 s = PyUnicode_FromObject(s);
6324 if (s == NULL)
6325 return NULL;
6326 if (sep != NULL) {
6327 sep = PyUnicode_FromObject(sep);
6328 if (sep == NULL) {
6329 Py_DECREF(s);
6330 return NULL;
6331 }
6332 }
6333
6334 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6335
6336 Py_DECREF(s);
6337 Py_XDECREF(sep);
6338 return result;
6339}
6340
6341PyDoc_STRVAR(rsplit__doc__,
6342"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6343\n\
6344Return a list of the words in S, using sep as the\n\
6345delimiter string, starting at the end of the string and\n\
6346working to the front. If maxsplit is given, at most maxsplit\n\
6347splits are done. If sep is not specified, any whitespace string\n\
6348is a separator.");
6349
6350static PyObject*
6351unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6352{
6353 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006354 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006355
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006357 return NULL;
6358
6359 if (substring == Py_None)
6360 return rsplit(self, NULL, maxcount);
6361 else if (PyUnicode_Check(substring))
6362 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6363 else
6364 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6365}
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006368"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369\n\
6370Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006371Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006372is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374static PyObject*
6375unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6376{
Guido van Rossum86662912000-04-11 15:38:46 +00006377 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
Guido van Rossum86662912000-04-11 15:38:46 +00006379 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 return NULL;
6381
Guido van Rossum86662912000-04-11 15:38:46 +00006382 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
6385static
6386PyObject *unicode_str(PyUnicodeObject *self)
6387{
Fred Drakee4315f52000-05-09 19:53:39 +00006388 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006391PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392"S.swapcase() -> unicode\n\
6393\n\
6394Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006395and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
6397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006398unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return fixup(self, fixswapcase);
6401}
6402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006403PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404"S.translate(table) -> unicode\n\
6405\n\
6406Return a copy of the string S, where all characters have been mapped\n\
6407through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006408Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6409Unmapped characters are left untouched. Characters mapped to None\n\
6410are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411
6412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006413unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414{
Tim Petersced69f82003-09-16 20:30:58 +00006415 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006417 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 "ignore");
6419}
6420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006421PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422"S.upper() -> unicode\n\
6423\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
6426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006427unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 return fixup(self, fixupper);
6430}
6431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006432PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433"S.zfill(width) -> unicode\n\
6434\n\
6435Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
6438static PyObject *
6439unicode_zfill(PyUnicodeObject *self, PyObject *args)
6440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006441 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 PyUnicodeObject *u;
6443
Martin v. Löwis18e16552006-02-15 17:27:45 +00006444 Py_ssize_t width;
6445 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return NULL;
6447
6448 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006449 if (PyUnicode_CheckExact(self)) {
6450 Py_INCREF(self);
6451 return (PyObject*) self;
6452 }
6453 else
6454 return PyUnicode_FromUnicode(
6455 PyUnicode_AS_UNICODE(self),
6456 PyUnicode_GET_SIZE(self)
6457 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 }
6459
6460 fill = width - self->length;
6461
6462 u = pad(self, fill, 0, '0');
6463
Walter Dörwald068325e2002-04-15 13:36:47 +00006464 if (u == NULL)
6465 return NULL;
6466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 if (u->str[fill] == '+' || u->str[fill] == '-') {
6468 /* move sign to beginning of string */
6469 u->str[0] = u->str[fill];
6470 u->str[fill] = '0';
6471 }
6472
6473 return (PyObject*) u;
6474}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475
#if 0
/* Disabled debugging helper: reports unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006484PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006485"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006487Return True if S starts with the specified prefix, False otherwise.\n\
6488With optional start, test S beginning at that position.\n\
6489With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
6491static PyObject *
6492unicode_startswith(PyUnicodeObject *self,
6493 PyObject *args)
6494{
6495 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006497 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 PyObject *result;
6499
Guido van Rossumb8872e62000-05-09 14:14:27 +00006500 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6501 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 return NULL;
6503 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6504 (PyObject *)substring);
6505 if (substring == NULL)
6506 return NULL;
6507
Guido van Rossum77f6a652002-04-03 22:41:51 +00006508 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509
6510 Py_DECREF(substring);
6511 return result;
6512}
6513
6514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006516"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006518Return True if S ends with the specified suffix, False otherwise.\n\
6519With optional start, test S beginning at that position.\n\
6520With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522static PyObject *
6523unicode_endswith(PyUnicodeObject *self,
6524 PyObject *args)
6525{
6526 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006528 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 PyObject *result;
6530
Guido van Rossumb8872e62000-05-09 14:14:27 +00006531 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6532 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 return NULL;
6534 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6535 (PyObject *)substring);
6536 if (substring == NULL)
6537 return NULL;
6538
Guido van Rossum77f6a652002-04-03 22:41:51 +00006539 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
6541 Py_DECREF(substring);
6542 return result;
6543}
6544
6545
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006546
6547static PyObject *
6548unicode_getnewargs(PyUnicodeObject *v)
6549{
6550 return Py_BuildValue("(u#)", v->str, v->length);
6551}
6552
6553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554static PyMethodDef unicode_methods[] = {
6555
6556 /* Order is according to common usage: often used methods should
6557 appear first, since lookup is done sequentially. */
6558
Georg Brandlecdc0a92006-03-30 12:19:07 +00006559 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006560 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6561 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006562 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006563 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6564 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6565 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6566 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6567 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6568 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6569 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6570 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6571 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6572 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006573 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006574 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006575/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6576 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6577 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6578 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006579 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006580 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006581 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006582 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6583 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6584 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6585 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6586 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6587 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6588 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6589 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6590 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6591 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6592 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6593 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6594 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6595 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006596 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006597#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006598 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599#endif
6600
6601#if 0
6602 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006603 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604#endif
6605
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006606 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 {NULL, NULL}
6608};
6609
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006610static PyObject *
6611unicode_mod(PyObject *v, PyObject *w)
6612{
6613 if (!PyUnicode_Check(v)) {
6614 Py_INCREF(Py_NotImplemented);
6615 return Py_NotImplemented;
6616 }
6617 return PyUnicode_Format(v, w);
6618}
6619
6620static PyNumberMethods unicode_as_number = {
6621 0, /*nb_add*/
6622 0, /*nb_subtract*/
6623 0, /*nb_multiply*/
6624 0, /*nb_divide*/
6625 unicode_mod, /*nb_remainder*/
6626};
6627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006630 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006631 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6632 (ssizeargfunc) unicode_getitem, /* sq_item */
6633 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 0, /* sq_ass_item */
6635 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006636 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637};
6638
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006639#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6640
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006641static PyObject*
6642unicode_subscript(PyUnicodeObject* self, PyObject* item)
6643{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006644 PyNumberMethods *nb = item->ob_type->tp_as_number;
6645 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6646 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006647 if (i == -1 && PyErr_Occurred())
6648 return NULL;
6649 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006650 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006651 return unicode_getitem(self, i);
6652 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006654 Py_UNICODE* source_buf;
6655 Py_UNICODE* result_buf;
6656 PyObject* result;
6657
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006658 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006659 &start, &stop, &step, &slicelength) < 0) {
6660 return NULL;
6661 }
6662
6663 if (slicelength <= 0) {
6664 return PyUnicode_FromUnicode(NULL, 0);
6665 } else {
6666 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006667 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6668 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006669
6670 if (result_buf == NULL)
6671 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006672
6673 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6674 result_buf[i] = source_buf[cur];
6675 }
Tim Petersced69f82003-09-16 20:30:58 +00006676
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006677 result = PyUnicode_FromUnicode(result_buf, slicelength);
6678 PyMem_FREE(result_buf);
6679 return result;
6680 }
6681 } else {
6682 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6683 return NULL;
6684 }
6685}
6686
6687static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006689 (binaryfunc)unicode_subscript, /* mp_subscript */
6690 (objobjargproc)0, /* mp_ass_subscript */
6691};
6692
Martin v. Löwis18e16552006-02-15 17:27:45 +00006693static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 const void **ptr)
6697{
6698 if (index != 0) {
6699 PyErr_SetString(PyExc_SystemError,
6700 "accessing non-existent unicode segment");
6701 return -1;
6702 }
6703 *ptr = (void *) self->str;
6704 return PyUnicode_GET_DATA_SIZE(self);
6705}
6706
Martin v. Löwis18e16552006-02-15 17:27:45 +00006707static Py_ssize_t
6708unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 const void **ptr)
6710{
6711 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006712 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return -1;
6714}
6715
6716static int
6717unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006718 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 if (lenp)
6721 *lenp = PyUnicode_GET_DATA_SIZE(self);
6722 return 1;
6723}
6724
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006725static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 const void **ptr)
6729{
6730 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 if (index != 0) {
6733 PyErr_SetString(PyExc_SystemError,
6734 "accessing non-existent unicode segment");
6735 return -1;
6736 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006737 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 if (str == NULL)
6739 return -1;
6740 *ptr = (void *) PyString_AS_STRING(str);
6741 return PyString_GET_SIZE(str);
6742}
6743
6744/* Helpers for PyUnicode_Format() */
6745
6746static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006747getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006749 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 if (argidx < arglen) {
6751 (*p_argidx)++;
6752 if (arglen < 0)
6753 return args;
6754 else
6755 return PyTuple_GetItem(args, argidx);
6756 }
6757 PyErr_SetString(PyExc_TypeError,
6758 "not enough arguments for format string");
6759 return NULL;
6760}
6761
/* Bit flags, one bit each (bits 0 through 4), so they can be OR'ed
   together and tested with &. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6767
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006769strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006771 register Py_ssize_t i;
6772 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 for (i = len - 1; i >= 0; i--)
6774 buffer[i] = (Py_UNICODE) charbuffer[i];
6775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 return len;
6777}
6778
Neal Norwitzfc76d632006-01-10 06:03:13 +00006779static int
6780doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6781{
Tim Peters15231542006-02-16 01:08:01 +00006782 Py_ssize_t result;
6783
Neal Norwitzfc76d632006-01-10 06:03:13 +00006784 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006785 result = strtounicode(buffer, (char *)buffer);
6786 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006787}
6788
6789static int
6790longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6791{
Tim Peters15231542006-02-16 01:08:01 +00006792 Py_ssize_t result;
6793
Neal Norwitzfc76d632006-01-10 06:03:13 +00006794 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006795 result = strtounicode(buffer, (char *)buffer);
6796 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006797}
6798
Guido van Rossum078151d2002-08-11 04:24:12 +00006799/* XXX To save some code duplication, formatfloat/long/int could have been
6800 shared with stringobject.c, converting from 8-bit to Unicode after the
6801 formatting is done. */
6802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803static int
6804formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006805 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 int flags,
6807 int prec,
6808 int type,
6809 PyObject *v)
6810{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006811 /* fmt = '%#.' + `prec` + `type`
6812 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 char fmt[20];
6814 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 x = PyFloat_AsDouble(v);
6817 if (x == -1.0 && PyErr_Occurred())
6818 return -1;
6819 if (prec < 0)
6820 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6822 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006823 /* Worst case length calc to ensure no buffer overrun:
6824
6825 'g' formats:
6826 fmt = %#.<prec>g
6827 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6828 for any double rep.)
6829 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6830
6831 'f' formats:
6832 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6833 len = 1 + 50 + 1 + prec = 52 + prec
6834
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006835 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006836 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006837
6838 */
6839 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6840 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006841 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006842 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006843 return -1;
6844 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006845 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6846 (flags&F_ALT) ? "#" : "",
6847 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006848 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
Tim Peters38fd5b62000-09-21 05:43:11 +00006851static PyObject*
6852formatlong(PyObject *val, int flags, int prec, int type)
6853{
6854 char *buf;
6855 int i, len;
6856 PyObject *str; /* temporary string object. */
6857 PyUnicodeObject *result;
6858
6859 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6860 if (!str)
6861 return NULL;
6862 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006863 if (!result) {
6864 Py_DECREF(str);
6865 return NULL;
6866 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006867 for (i = 0; i < len; i++)
6868 result->str[i] = buf[i];
6869 result->str[len] = 0;
6870 Py_DECREF(str);
6871 return (PyObject*)result;
6872}
6873
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874static int
6875formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006876 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 int flags,
6878 int prec,
6879 int type,
6880 PyObject *v)
6881{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006882 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006883 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6884 * + 1 + 1
6885 * = 24
6886 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006887 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006888 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 long x;
6890
6891 x = PyInt_AsLong(v);
6892 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006893 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006894 if (x < 0 && type == 'u') {
6895 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006896 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006897 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6898 sign = "-";
6899 else
6900 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006902 prec = 1;
6903
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006904 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6905 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006906 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006907 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006908 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006909 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006910 return -1;
6911 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006912
6913 if ((flags & F_ALT) &&
6914 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006915 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006916 * of issues that cause pain:
6917 * - when 0 is being converted, the C standard leaves off
6918 * the '0x' or '0X', which is inconsistent with other
6919 * %#x/%#X conversions and inconsistent with Python's
6920 * hex() function
6921 * - there are platforms that violate the standard and
6922 * convert 0 with the '0x' or '0X'
6923 * (Metrowerks, Compaq Tru64)
6924 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006925 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006926 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006927 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006928 * We can achieve the desired consistency by inserting our
6929 * own '0x' or '0X' prefix, and substituting %x/%X in place
6930 * of %#x/%#X.
6931 *
6932 * Note that this is the same approach as used in
6933 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006934 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006935 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6936 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006937 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006938 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006939 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6940 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006941 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006942 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006943 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006944 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006945 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006946 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947}
6948
6949static int
6950formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006951 size_t buflen,
6952 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006954 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006955 if (PyUnicode_Check(v)) {
6956 if (PyUnicode_GET_SIZE(v) != 1)
6957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006961 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006962 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006963 goto onError;
6964 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967 else {
6968 /* Integer input truncated to a character */
6969 long x;
6970 x = PyInt_AsLong(v);
6971 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006972 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006973#ifdef Py_UNICODE_WIDE
6974 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006975 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006976 "%c arg not in range(0x110000) "
6977 "(wide Python build)");
6978 return -1;
6979 }
6980#else
6981 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006982 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006983 "%c arg not in range(0x10000) "
6984 "(narrow Python build)");
6985 return -1;
6986 }
6987#endif
6988 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 }
6990 buf[1] = '\0';
6991 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006992
6993 onError:
6994 PyErr_SetString(PyExc_TypeError,
6995 "%c requires int or char");
6996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
7008
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009PyObject *PyUnicode_Format(PyObject *format,
7010 PyObject *args)
7011{
7012 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007013 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 int args_owned = 0;
7015 PyUnicodeObject *result = NULL;
7016 PyObject *dict = NULL;
7017 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007018
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 if (format == NULL || args == NULL) {
7020 PyErr_BadInternalCall();
7021 return NULL;
7022 }
7023 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007024 if (uformat == NULL)
7025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 fmt = PyUnicode_AS_UNICODE(uformat);
7027 fmtcnt = PyUnicode_GET_SIZE(uformat);
7028
7029 reslen = rescnt = fmtcnt + 100;
7030 result = _PyUnicode_New(reslen);
7031 if (result == NULL)
7032 goto onError;
7033 res = PyUnicode_AS_UNICODE(result);
7034
7035 if (PyTuple_Check(args)) {
7036 arglen = PyTuple_Size(args);
7037 argidx = 0;
7038 }
7039 else {
7040 arglen = -1;
7041 argidx = -2;
7042 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007043 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7044 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 dict = args;
7046
7047 while (--fmtcnt >= 0) {
7048 if (*fmt != '%') {
7049 if (--rescnt < 0) {
7050 rescnt = fmtcnt + 100;
7051 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007052 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7055 --rescnt;
7056 }
7057 *res++ = *fmt++;
7058 }
7059 else {
7060 /* Got a format specifier */
7061 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007062 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 Py_UNICODE c = '\0';
7065 Py_UNICODE fill;
7066 PyObject *v = NULL;
7067 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007068 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007070 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007071 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
7073 fmt++;
7074 if (*fmt == '(') {
7075 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007076 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 PyObject *key;
7078 int pcount = 1;
7079
7080 if (dict == NULL) {
7081 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007082 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 goto onError;
7084 }
7085 ++fmt;
7086 --fmtcnt;
7087 keystart = fmt;
7088 /* Skip over balanced parentheses */
7089 while (pcount > 0 && --fmtcnt >= 0) {
7090 if (*fmt == ')')
7091 --pcount;
7092 else if (*fmt == '(')
7093 ++pcount;
7094 fmt++;
7095 }
7096 keylen = fmt - keystart - 1;
7097 if (fmtcnt < 0 || pcount > 0) {
7098 PyErr_SetString(PyExc_ValueError,
7099 "incomplete format key");
7100 goto onError;
7101 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007102#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007103 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 then looked up since Python uses strings to hold
7105 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007106 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 key = PyUnicode_EncodeUTF8(keystart,
7108 keylen,
7109 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007110#else
7111 key = PyUnicode_FromUnicode(keystart, keylen);
7112#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 if (key == NULL)
7114 goto onError;
7115 if (args_owned) {
7116 Py_DECREF(args);
7117 args_owned = 0;
7118 }
7119 args = PyObject_GetItem(dict, key);
7120 Py_DECREF(key);
7121 if (args == NULL) {
7122 goto onError;
7123 }
7124 args_owned = 1;
7125 arglen = -1;
7126 argidx = -2;
7127 }
7128 while (--fmtcnt >= 0) {
7129 switch (c = *fmt++) {
7130 case '-': flags |= F_LJUST; continue;
7131 case '+': flags |= F_SIGN; continue;
7132 case ' ': flags |= F_BLANK; continue;
7133 case '#': flags |= F_ALT; continue;
7134 case '0': flags |= F_ZERO; continue;
7135 }
7136 break;
7137 }
7138 if (c == '*') {
7139 v = getnextarg(args, arglen, &argidx);
7140 if (v == NULL)
7141 goto onError;
7142 if (!PyInt_Check(v)) {
7143 PyErr_SetString(PyExc_TypeError,
7144 "* wants int");
7145 goto onError;
7146 }
7147 width = PyInt_AsLong(v);
7148 if (width < 0) {
7149 flags |= F_LJUST;
7150 width = -width;
7151 }
7152 if (--fmtcnt >= 0)
7153 c = *fmt++;
7154 }
7155 else if (c >= '0' && c <= '9') {
7156 width = c - '0';
7157 while (--fmtcnt >= 0) {
7158 c = *fmt++;
7159 if (c < '0' || c > '9')
7160 break;
7161 if ((width*10) / 10 != width) {
7162 PyErr_SetString(PyExc_ValueError,
7163 "width too big");
7164 goto onError;
7165 }
7166 width = width*10 + (c - '0');
7167 }
7168 }
7169 if (c == '.') {
7170 prec = 0;
7171 if (--fmtcnt >= 0)
7172 c = *fmt++;
7173 if (c == '*') {
7174 v = getnextarg(args, arglen, &argidx);
7175 if (v == NULL)
7176 goto onError;
7177 if (!PyInt_Check(v)) {
7178 PyErr_SetString(PyExc_TypeError,
7179 "* wants int");
7180 goto onError;
7181 }
7182 prec = PyInt_AsLong(v);
7183 if (prec < 0)
7184 prec = 0;
7185 if (--fmtcnt >= 0)
7186 c = *fmt++;
7187 }
7188 else if (c >= '0' && c <= '9') {
7189 prec = c - '0';
7190 while (--fmtcnt >= 0) {
7191 c = Py_CHARMASK(*fmt++);
7192 if (c < '0' || c > '9')
7193 break;
7194 if ((prec*10) / 10 != prec) {
7195 PyErr_SetString(PyExc_ValueError,
7196 "prec too big");
7197 goto onError;
7198 }
7199 prec = prec*10 + (c - '0');
7200 }
7201 }
7202 } /* prec */
7203 if (fmtcnt >= 0) {
7204 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 if (--fmtcnt >= 0)
7206 c = *fmt++;
7207 }
7208 }
7209 if (fmtcnt < 0) {
7210 PyErr_SetString(PyExc_ValueError,
7211 "incomplete format");
7212 goto onError;
7213 }
7214 if (c != '%') {
7215 v = getnextarg(args, arglen, &argidx);
7216 if (v == NULL)
7217 goto onError;
7218 }
7219 sign = 0;
7220 fill = ' ';
7221 switch (c) {
7222
7223 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007224 pbuf = formatbuf;
7225 /* presume that buffer length is at least 1 */
7226 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 len = 1;
7228 break;
7229
7230 case 's':
7231 case 'r':
7232 if (PyUnicode_Check(v) && c == 's') {
7233 temp = v;
7234 Py_INCREF(temp);
7235 }
7236 else {
7237 PyObject *unicode;
7238 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007239 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 else
7241 temp = PyObject_Repr(v);
7242 if (temp == NULL)
7243 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007244 if (PyUnicode_Check(temp))
7245 /* nothing to do */;
7246 else if (PyString_Check(temp)) {
7247 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007248 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007250 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007252 Py_DECREF(temp);
7253 temp = unicode;
7254 if (temp == NULL)
7255 goto onError;
7256 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007257 else {
7258 Py_DECREF(temp);
7259 PyErr_SetString(PyExc_TypeError,
7260 "%s argument has non-string str()");
7261 goto onError;
7262 }
7263 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007264 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 len = PyUnicode_GET_SIZE(temp);
7266 if (prec >= 0 && len > prec)
7267 len = prec;
7268 break;
7269
7270 case 'i':
7271 case 'd':
7272 case 'u':
7273 case 'o':
7274 case 'x':
7275 case 'X':
7276 if (c == 'i')
7277 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007278 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007279 temp = formatlong(v, flags, prec, c);
7280 if (!temp)
7281 goto onError;
7282 pbuf = PyUnicode_AS_UNICODE(temp);
7283 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007284 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007286 else {
7287 pbuf = formatbuf;
7288 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7289 flags, prec, c, v);
7290 if (len < 0)
7291 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007292 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007293 }
7294 if (flags & F_ZERO)
7295 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 break;
7297
7298 case 'e':
7299 case 'E':
7300 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007301 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 case 'g':
7303 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007304 if (c == 'F')
7305 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007306 pbuf = formatbuf;
7307 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7308 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 if (len < 0)
7310 goto onError;
7311 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007312 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 fill = '0';
7314 break;
7315
7316 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007317 pbuf = formatbuf;
7318 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 if (len < 0)
7320 goto onError;
7321 break;
7322
7323 default:
7324 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007325 "unsupported format character '%c' (0x%x) "
7326 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007327 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007328 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007329 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 goto onError;
7331 }
7332 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007333 if (*pbuf == '-' || *pbuf == '+') {
7334 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 len--;
7336 }
7337 else if (flags & F_SIGN)
7338 sign = '+';
7339 else if (flags & F_BLANK)
7340 sign = ' ';
7341 else
7342 sign = 0;
7343 }
7344 if (width < len)
7345 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007346 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 reslen -= rescnt;
7348 rescnt = width + fmtcnt + 100;
7349 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007350 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007351 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007352 PyErr_NoMemory();
7353 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007354 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007355 if (_PyUnicode_Resize(&result, reslen) < 0) {
7356 Py_XDECREF(temp);
7357 goto onError;
7358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 res = PyUnicode_AS_UNICODE(result)
7360 + reslen - rescnt;
7361 }
7362 if (sign) {
7363 if (fill != ' ')
7364 *res++ = sign;
7365 rescnt--;
7366 if (width > len)
7367 width--;
7368 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007369 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7370 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007371 assert(pbuf[1] == c);
7372 if (fill != ' ') {
7373 *res++ = *pbuf++;
7374 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007375 }
Tim Petersfff53252001-04-12 18:38:48 +00007376 rescnt -= 2;
7377 width -= 2;
7378 if (width < 0)
7379 width = 0;
7380 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 if (width > len && !(flags & F_LJUST)) {
7383 do {
7384 --rescnt;
7385 *res++ = fill;
7386 } while (--width > len);
7387 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007388 if (fill == ' ') {
7389 if (sign)
7390 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007391 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007392 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007393 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007394 *res++ = *pbuf++;
7395 *res++ = *pbuf++;
7396 }
7397 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007398 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 res += len;
7400 rescnt -= len;
7401 while (--width >= len) {
7402 --rescnt;
7403 *res++ = ' ';
7404 }
7405 if (dict && (argidx < arglen) && c != '%') {
7406 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007407 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007408 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 goto onError;
7410 }
7411 Py_XDECREF(temp);
7412 } /* '%' */
7413 } /* until end */
7414 if (argidx < arglen && !dict) {
7415 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007416 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 goto onError;
7418 }
7419
Thomas Woutersa96affe2006-03-12 00:29:36 +00007420 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 if (args_owned) {
7423 Py_DECREF(args);
7424 }
7425 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 return (PyObject *)result;
7427
7428 onError:
7429 Py_XDECREF(result);
7430 Py_DECREF(uformat);
7431 if (args_owned) {
7432 Py_DECREF(args);
7433 }
7434 return NULL;
7435}
7436
/* Buffer interface slots of the unicode type; installed as the
   tp_as_buffer member of PyUnicode_Type.  One handler each for the
   read-buffer, write-buffer, segment-count and char-buffer slots. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
7443
/* Forward declaration: unicode_new() hands construction of subtype
   instances over to unicode_subtype_new(), which is defined after it and
   calls back into unicode_new(). */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7446
Tim Peters6d6c1a32001-08-02 04:15:00 +00007447static PyObject *
7448unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7449{
7450 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007451 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007452 char *encoding = NULL;
7453 char *errors = NULL;
7454
Guido van Rossume023fe02001-08-30 03:12:59 +00007455 if (type != &PyUnicode_Type)
7456 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007457 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7458 kwlist, &x, &encoding, &errors))
7459 return NULL;
7460 if (x == NULL)
7461 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007462 if (encoding == NULL && errors == NULL)
7463 return PyObject_Unicode(x);
7464 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007465 return PyUnicode_FromEncodedObject(x, encoding, errors);
7466}
7467
Guido van Rossume023fe02001-08-30 03:12:59 +00007468static PyObject *
7469unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7470{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007471 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007472 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007473
7474 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7475 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7476 if (tmp == NULL)
7477 return NULL;
7478 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007479 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007480 if (pnew == NULL) {
7481 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007482 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007483 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007484 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7485 if (pnew->str == NULL) {
7486 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007487 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007488 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007489 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007490 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007491 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7492 pnew->length = n;
7493 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007494 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007495 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007496}
7497
/* Docstring of the unicode type; installed as the tp_doc member of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007504
/* Type object of the unicode type.  It derives from PyBaseString_Type,
   may itself be subclassed (Py_TPFLAGS_BASETYPE) and creates its
   instances through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7549
7550/* Initialize the Unicode implementation */
7551
Thomas Wouters78890102000-07-22 19:25:51 +00007552void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007554 int i;
7555
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007556 /* XXX - move this array to unicodectype.c ? */
7557 Py_UNICODE linebreak[] = {
7558 0x000A, /* LINE FEED */
7559 0x000D, /* CARRIAGE RETURN */
7560 0x001C, /* FILE SEPARATOR */
7561 0x001D, /* GROUP SEPARATOR */
7562 0x001E, /* RECORD SEPARATOR */
7563 0x0085, /* NEXT LINE */
7564 0x2028, /* LINE SEPARATOR */
7565 0x2029, /* PARAGRAPH SEPARATOR */
7566 };
7567
Fred Drakee4315f52000-05-09 19:53:39 +00007568 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007569 unicode_freelist = NULL;
7570 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007572 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007573 for (i = 0; i < 256; i++)
7574 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007575 if (PyType_Ready(&PyUnicode_Type) < 0)
7576 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007577
7578 /* initialize the linebreak bloom filter */
7579 bloom_linebreak = make_bloom_mask(
7580 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7581 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582}
7583
7584/* Finalize the Unicode implementation */
7585
7586void
Thomas Wouters78890102000-07-22 19:25:51 +00007587_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007589 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007590 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007592 Py_XDECREF(unicode_empty);
7593 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007594
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007595 for (i = 0; i < 256; i++) {
7596 if (unicode_latin1[i]) {
7597 Py_DECREF(unicode_latin1[i]);
7598 unicode_latin1[i] = NULL;
7599 }
7600 }
7601
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007602 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 PyUnicodeObject *v = u;
7604 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007605 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007606 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007607 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007608 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007610 unicode_freelist = NULL;
7611 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007613
Anthony Baxterac6bd462006-04-13 02:06:09 +00007614#ifdef __cplusplus
7615}
7616#endif
7617
7618
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007619/*
7620Local variables:
7621c-basic-offset: 4
7622indent-tabs-mode: nil
7623End:
7624*/